Spaces:

Draken007
/

geochatbot

Runtime error

App Files Files Community

geochatbot / llm /Lib /site-packages /faiss /extra_wrappers.py

Draken007

Upload 7228 files

2a0bc63 verified over 1 year ago

raw

history blame contribute delete

21 kB

	# Copyright (c) Facebook, Inc. and its affiliates.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	# @nolint

	# not linting this file because it imports * from swigfaiss, which
	# causes a ton of useless warnings.

	import numpy as np

	from faiss.loader import *

	import faiss

	import collections.abc


	###########################################
	# Wrapper for a few functions
	###########################################


	def kmin(array, k):
	    """Return the k smallest values of each row of a matrix.

	    Parameters
	    ----------
	    array : array_like
	        2D input; it is converted to a contiguous float32 array.
	    k : int
	        number of values to keep per row

	    Returns
	    -------
	    D : ndarray
	        float32 array of shape (nrow, k) with the selected values
	    I : ndarray
	        int64 array of shape (nrow, k) with the indices of those values
	    """
	    rows = np.ascontiguousarray(array, dtype='float32')
	    nrow, ncol = rows.shape
	    values = np.zeros((nrow, k), dtype='float32')
	    labels = np.zeros((nrow, k), dtype='int64')
	    # one max-heap per row, writing directly into the two output arrays
	    heaps = faiss.float_maxheap_array_t()
	    heaps.nh, heaps.k = nrow, k
	    heaps.val = swig_ptr(values)
	    heaps.ids = swig_ptr(labels)
	    heaps.heapify()
	    heaps.addn(ncol, swig_ptr(rows))
	    heaps.reorder()
	    return values, labels


	def kmax(array, k):
	    """Return the k largest values of each row of a matrix.

	    Parameters
	    ----------
	    array : array_like
	        2D input; it is converted to a contiguous float32 array.
	    k : int
	        number of values to keep per row

	    Returns
	    -------
	    D : ndarray
	        float32 array of shape (nrow, k) with the selected values
	    I : ndarray
	        int64 array of shape (nrow, k) with the indices of those values
	    """
	    rows = np.ascontiguousarray(array, dtype='float32')
	    nrow, ncol = rows.shape
	    values = np.zeros((nrow, k), dtype='float32')
	    labels = np.zeros((nrow, k), dtype='int64')
	    # one min-heap per row, writing directly into the two output arrays
	    heaps = faiss.float_minheap_array_t()
	    heaps.nh, heaps.k = nrow, k
	    heaps.val = swig_ptr(values)
	    heaps.ids = swig_ptr(labels)
	    heaps.heapify()
	    heaps.addn(ncol, swig_ptr(rows))
	    heaps.reorder()
	    return values, labels


	def pairwise_distances(xq, xb, metric=METRIC_L2, metric_arg=0):
	    """Compute the full distance matrix between two sets of vectors.

	    Parameters
	    ----------
	    xq : array_like
	        first set of vectors, shape (nq, d); converted to float32
	    xb : array_like
	        second set of vectors, shape (nb, d); converted to float32
	    metric : MetricType, optional
	        METRIC_L2 is handled by pairwise_L2sqr, METRIC_INNER_PRODUCT by a
	        matrix product, anything else by pairwise_extra_distances
	    metric_arg : optional
	        extra argument, only forwarded to pairwise_extra_distances

	    Returns
	    -------
	    dis : ndarray
	        float32 array of shape (nq, nb)
	    """
	    queries = np.ascontiguousarray(xq, dtype='float32')
	    base = np.ascontiguousarray(xb, dtype='float32')
	    nq, d = queries.shape
	    nb, d2 = base.shape
	    assert d == d2
	    dis = np.empty((nq, nb), dtype='float32')

	    if metric == METRIC_L2:
	        pairwise_L2sqr(
	            d, nq, swig_ptr(queries),
	            nb, swig_ptr(base),
	            swig_ptr(dis))
	        return dis

	    if metric == METRIC_INNER_PRODUCT:
	        dis[:] = queries @ base.T
	        return dis

	    # every other metric goes through the generic implementation
	    pairwise_extra_distances(
	        d, nq, swig_ptr(queries),
	        nb, swig_ptr(base),
	        metric, metric_arg,
	        swig_ptr(dis))
	    return dis


	def rand(n, seed=12345):
	    """Return a float32 array of shape `n` filled by float_rand.

	    `n` is passed to np.empty, `seed` is forwarded to float_rand.
	    """
	    out = np.empty(n, dtype='float32')
	    float_rand(swig_ptr(out), out.size, seed)
	    return out


	def randint(n, seed=12345, vmax=None):
	    """Return an int64 array of shape `n` filled with random integers.

	    When `vmax` is given the values come from int64_rand_max (which
	    receives vmax), otherwise from int64_rand. `seed` is forwarded in
	    both cases.
	    """
	    out = np.empty(n, dtype='int64')
	    if vmax is not None:
	        int64_rand_max(swig_ptr(out), out.size, vmax, seed)
	    else:
	        int64_rand(swig_ptr(out), out.size, seed)
	    return out


	# alternative name for randint
	lrand = randint


	def randn(n, seed=12345):
	    """Return a float32 array of shape `n` filled by float_randn.

	    `n` is passed to np.empty, `seed` is forwarded to float_randn.
	    """
	    out = np.empty(n, dtype='float32')
	    float_randn(swig_ptr(out), out.size, seed)
	    return out


	def checksum(a):
	    """Compute a checksum for quick-and-dirty comparisons of arrays.

	    The array is reinterpreted as bytes. A 1D input yields the single
	    value returned by bvec_checksum; otherwise one uint64 checksum per
	    row is returned.
	    """
	    raw = a.view('uint8')
	    if raw.ndim != 1:
	        nrow, row_bytes = raw.shape
	        sums = np.zeros(nrow, dtype='uint64')
	        bvecs_checksum(nrow, row_bytes, swig_ptr(raw), swig_ptr(sums))
	        return sums
	    return bvec_checksum(raw.size, swig_ptr(raw))

	# keep a handle on the low-level function before its name is rebound below
	rand_smooth_vectors_c = rand_smooth_vectors

	def rand_smooth_vectors(n, d, seed=1234):
	    """Return an (n, d) float32 array filled by the low-level
	    rand_smooth_vectors function, using the given seed."""
	    out = np.empty((n, d), dtype='float32')
	    rand_smooth_vectors_c(n, d, swig_ptr(out), seed)
	    return out


	def eval_intersection(I1, I2):
	    """Total size of the row-by-row intersection of two result tables.

	    Both tables are converted to int64 and must have the same number of
	    rows; the per-row intersection sizes are summed into one integer.
	    """
	    left = np.ascontiguousarray(I1, dtype='int64')
	    right = np.ascontiguousarray(I2, dtype='int64')
	    n = left.shape[0]
	    assert right.shape[0] == n
	    k1, k2 = left.shape[1], right.shape[1]
	    return sum(
	        ranklist_intersection_size(k1, swig_ptr(row1), k2, swig_ptr(row2))
	        for row1, row2 in zip(left, right)
	    )


	def normalize_L2(x):
	    """Hand the 2D array `x` to fvec_renorm_L2.

	    Nothing is returned; presumably the rows of `x` are L2-normalized
	    in place (TODO confirm against the C++ function).
	    """
	    nvec, dim = x.shape[0], x.shape[1]
	    fvec_renorm_L2(dim, nvec, swig_ptr(x))

	# keep a handle on the low-level function before its name is rebound below
	bucket_sort_c = bucket_sort

	def bucket_sort(tab, nbucket=None, nt=0):
	    """Bucket-sort a table of integers.

	    Parameters
	    ----------
	    tab : array_like
	        elements to sort (converted to int64), max value nbucket - 1
	    nbucket : integer
	        number of buckets; when None it is set to tab.max() + 1
	    nt : integer
	        number of threads to use (0 = use unthreaded codepath)

	    Returns
	    -------
	    lims : array_like
	        cumulative sum of bucket sizes (size nbucket + 1)
	    perm : array_like
	        perm[lims[i] : lims[i + 1]] contains the indices of bucket #i
	        (size tab.size)
	    """
	    values = np.ascontiguousarray(tab, dtype="int64")
	    nbucket = int(values.max() + 1) if nbucket is None else nbucket
	    lims = np.empty(nbucket + 1, dtype='int64')
	    perm = np.empty(values.size, dtype='int64')
	    # the low-level function is given the data viewed as uint64
	    values_ptr = faiss.swig_ptr(values.view('uint64'))
	    bucket_sort_c(
	        values.size, values_ptr,
	        nbucket, faiss.swig_ptr(lims), faiss.swig_ptr(perm),
	        nt
	    )
	    return lims, perm

	# keep a handle on the low-level function before its name is rebound below
	matrix_bucket_sort_inplace_c = matrix_bucket_sort_inplace

	def matrix_bucket_sort_inplace(tab, nbucket=None, nt=0):
	    """Bucket-sort a matrix in place, recording the original row of
	    each element.

	    Parameters
	    ----------
	    tab : array_like
	        int32 or int64 array of size (N, ncol) that contains the bucket
	        ids, maximum value nbucket - 1.
	        On output, the elements are shuffled such that the flat array
	        tab.ravel()[lims[i] : lims[i + 1]] contains the row numbers
	        of each bucket entry.
	    nbucket : integer
	        number of buckets (the maximum value in tab should be
	        nbucket - 1); when None it is set to tab.max() + 1
	    nt : integer
	        number of threads to use (0 = use unthreaded codepath)

	    Returns
	    -------
	    lims : array_like
	        cumulative sum of bucket sizes (size nbucket + 1)
	    """
	    assert tab.dtype in ('int32', 'int64')
	    nrow, ncol = tab.shape
	    nbucket = int(tab.max() + 1) if nbucket is None else nbucket
	    lims = np.empty(nbucket + 1, dtype='int64')
	    matrix_bucket_sort_inplace_c(
	        nrow, ncol, faiss.swig_ptr(tab),
	        nbucket, faiss.swig_ptr(lims),
	        nt
	    )
	    return lims


	###########################################
	# ResultHeap
	###########################################

	class ResultHeap:
	    """Accumulate query results coming from a sliced dataset.

	    The running results live in self.D (float32) and self.I (int64),
	    both of shape (nq, k). Call finalize() once every slice was added.
	    """

	    def __init__(self, nq, k, keep_max=False):
	        """
	        nq: number of query vectors,
	        k: number of results per query
	        keep_max: keep the top-k maximum values instead of the minima
	        """
	        self.nq, self.k = nq, k
	        self.D = np.zeros((nq, k), dtype='float32')
	        self.I = np.zeros((nq, k), dtype='int64')
	        # keeping maxima uses a min-heap, keeping minima uses a max-heap
	        heaps = float_minheap_array_t() if keep_max else float_maxheap_array_t()
	        heaps.nh, heaps.k = nq, k
	        # the heaps operate directly on the memory of self.D / self.I
	        heaps.ids = swig_ptr(self.I)
	        heaps.val = swig_ptr(self.D)
	        heaps.heapify()
	        self.heaps = heaps

	    def add_result(self, D, I):
	        """
	        Add results for all heaps.
	        D, I should be of size (nh, nres)
	        D, I do not need to be in a particular order (heap or sorted)
	        """
	        nq, kd = D.shape
	        distances = np.ascontiguousarray(D, dtype='float32')
	        labels = np.ascontiguousarray(I, dtype='int64')
	        assert labels.shape == (nq, kd)
	        assert nq == self.nq
	        self.heaps.addn_with_ids(
	            kd, swig_ptr(distances),
	            swig_ptr(labels), kd)

	    def add_result_subset(self, subset, D, I):
	        """
	        Add results for a subset of heaps.
	        D, I should hold results for all the subset;
	        as a special case, if I is 1D, then all ids are assumed to be
	        the same (the id stride passed down is then 0).
	        """
	        nsubset, kd = D.shape
	        assert nsubset == len(subset)
	        assert (
	            (I.ndim == 2 and D.shape == I.shape) or
	            (I.ndim == 1 and I.shape == (kd, ))
	        )
	        distances = np.ascontiguousarray(D, dtype='float32')
	        labels = np.ascontiguousarray(I, dtype='int64')
	        heap_ids = np.ascontiguousarray(subset, dtype='int64')
	        if labels.ndim == 1:
	            id_stride = 0
	        else:
	            id_stride = kd
	        self.heaps.addn_query_subset_with_ids(
	            nsubset, swig_ptr(heap_ids),
	            kd, swig_ptr(distances), swig_ptr(labels), id_stride
	        )

	    def finalize(self):
	        """Call reorder() on the heaps once all results were added."""
	        self.heaps.reorder()


	def merge_knn_results(Dall, Iall, keep_max=False):
	    """
	    Merge sorted knn results obtained from different shards of a dataset.

	    Dall and Iall are of size (nshard, nq, k); each D[i, j] should be
	    sorted. keep_max selects merge_knn_results_CMax instead of
	    merge_knn_results_CMin.
	    Returns D, I of size (nq, k) as the merged result set, with the
	    same dtypes as the inputs.
	    """
	    assert Iall.shape == Dall.shape
	    nshard, n, k = Dall.shape
	    merged_D = np.empty((n, k), dtype=Dall.dtype)
	    merged_I = np.empty((n, k), dtype=Iall.dtype)
	    if keep_max:
	        merge = merge_knn_results_CMax
	    else:
	        merge = merge_knn_results_CMin
	    merge(
	        n, k, nshard,
	        swig_ptr(Dall), swig_ptr(Iall),
	        swig_ptr(merged_D), swig_ptr(merged_I)
	    )
	    return merged_D, merged_I

	######################################################
	# Efficient ID to ID map
	######################################################

	class MapInt64ToInt64:
	    """int64 -> int64 map backed by a (capacity, 2) int64 table that is
	    managed by the faiss.hashtable_int64_to_int64_* functions.

	    The capacity is fixed at construction and must be a power of 2.
	    """

	    def __init__(self, capacity):
	        """Allocate and initialize a table with `capacity` slots."""
	        self.log2_capacity = int(np.log2(capacity))
	        assert capacity == 2 ** self.log2_capacity, "need power of 2 capacity"
	        self.capacity = capacity
	        table = np.empty((capacity, 2), dtype='int64')
	        self.tab = table
	        faiss.hashtable_int64_to_int64_init(
	            self.log2_capacity, swig_ptr(table))

	    def add(self, keys, vals):
	        """Insert the pairs (keys[i], vals[i]); both arrays are 1D and
	        of the same length."""
	        (n,) = keys.shape
	        assert vals.shape == (n,)
	        table_ptr = swig_ptr(self.tab)
	        faiss.hashtable_int64_to_int64_add(
	            self.log2_capacity, table_ptr,
	            n, swig_ptr(keys), swig_ptr(vals))

	    def lookup(self, keys):
	        """Return an int64 array with the value found for each entry of
	        the 1D array `keys`."""
	        (n,) = keys.shape
	        found = np.empty((n,), dtype='int64')
	        table_ptr = swig_ptr(self.tab)
	        faiss.hashtable_int64_to_int64_lookup(
	            self.log2_capacity, table_ptr,
	            n, swig_ptr(keys), swig_ptr(found))
	        return found

	######################################################
	# KNN function
	######################################################

	def knn(xq, xb, k, metric=METRIC_L2):
	    """
	    Compute the k nearest neighbors of a set of vectors without
	    constructing an index.

	    Parameters
	    ----------
	    xq : array_like
	        Query vectors, shape (nq, d) where the dimension d is the same
	        as for xb. Converted to float32.
	    xb : array_like
	        Database vectors, shape (nb, d) where the dimension d is the
	        same as for xq. Converted to float32.
	    k : int
	        Number of nearest neighbors.
	    metric : MetricType, optional
	        distance measure to use (either METRIC_L2 or METRIC_INNER_PRODUCT)

	    Returns
	    -------
	    D : array_like
	        Distances of the nearest neighbors, shape (nq, k)
	    I : array_like
	        Labels of the nearest neighbors, shape (nq, k)

	    Raises
	    ------
	    NotImplementedError
	        if metric is neither METRIC_L2 nor METRIC_INNER_PRODUCT
	    """
	    queries = np.ascontiguousarray(xq, dtype='float32')
	    base = np.ascontiguousarray(xb, dtype='float32')
	    nq, d = queries.shape
	    nb, d2 = base.shape
	    assert d == d2

	    labels = np.empty((nq, k), dtype='int64')
	    distances = np.empty((nq, k), dtype='float32')

	    # both low-level functions take the same argument list
	    if metric == METRIC_L2:
	        search = knn_L2sqr
	    elif metric == METRIC_INNER_PRODUCT:
	        search = knn_inner_product
	    else:
	        raise NotImplementedError("only L2 and INNER_PRODUCT are supported")

	    search(
	        swig_ptr(queries), swig_ptr(base),
	        d, nq, nb, k, swig_ptr(distances), swig_ptr(labels)
	    )
	    return distances, labels

	def knn_hamming(xq, xb, k, variant="hc"):
	    """
	    Compute the k nearest neighbors of a set of binary vectors without
	    constructing an index.

	    Parameters
	    ----------
	    xq : array_like
	        Query vectors, shape (nq, d) where d is the number of bits / 8
	        `dtype` must be uint8.
	    xb : array_like
	        Database vectors, shape (nb, d) where d is the number of bits / 8
	        `dtype` must be uint8.
	    k : int
	        Number of nearest neighbors.
	    variant : string
	        Function variant to use, either "mc" (counter) or "hc" (heap)

	    Returns
	    -------
	    D : array_like
	        Distances of the nearest neighbors, shape (nq, k), int32
	    I : array_like
	        Labels of the nearest neighbors, shape (nq, k), int64

	    Raises
	    ------
	    NotImplementedError
	        if variant is neither "hc" nor "mc"
	    """
	    nq, d = xq.shape
	    nb, d2 = xb.shape
	    assert d == d2
	    distances = np.empty((nq, k), dtype='int32')
	    labels = np.empty((nq, k), dtype='int64')

	    if variant == "mc":
	        faiss.hammings_knn_mc(
	            faiss.swig_ptr(xq), faiss.swig_ptr(xb), nq, nb, k, d,
	            faiss.swig_ptr(distances), faiss.swig_ptr(labels)
	        )
	        return distances, labels

	    if variant != "hc":
	        raise NotImplementedError

	    # "hc": results are collected in a heap array laid over the outputs
	    heap = faiss.int_maxheap_array_t()
	    heap.nh, heap.k = nq, k
	    heap.val = faiss.swig_ptr(distances)
	    heap.ids = faiss.swig_ptr(labels)
	    # NOTE(review): the trailing 1 is presumably the "order results"
	    # flag of hammings_knn_hc -- confirm against the C++ signature
	    faiss.hammings_knn_hc(
	        heap, faiss.swig_ptr(xq), faiss.swig_ptr(xb), nb,
	        d, 1
	    )
	    return distances, labels


	###########################################
	# Kmeans object
	###########################################


	class Kmeans:
	    """Object that performs k-means clustering and manages the centroids.
	    The `Kmeans` class is essentially a wrapper around the C++ `Clustering` object.

	    Parameters
	    ----------
	    d : int
	        dimension of the vectors to cluster
	    k : int
	        number of clusters
	    gpu: bool or int, optional
	        False: don't use GPU
	        True (or -1): use all GPUs
	        number: use this many GPUs
	    progressive_dim_steps:
	        use a progressive dimension clustering (with that number of steps)

	    Subsequent parameters are fields of the Clustering object. The most important are:

	    niter: int, optional
	        clustering iterations
	    nredo: int, optional
	        redo clustering this many times and keep best
	    verbose: bool, optional
	    spherical: bool, optional
	        do we want normalized centroids?
	    int_centroids: bool, optional
	        round centroids coordinates to integer
	    seed: int, optional
	        seed for the random number generator

	    """

	    def __init__(self, d, k, **kwargs):
	        """d: input dimension, k: nb of centroids. Additional
	        parameters are passed on the ClusteringParameters object,
	        including niter=25, verbose=False, spherical = False
	        """
	        self.d = d
	        self.reset(k)
	        self.gpu = False
	        if "progressive_dim_steps" in kwargs:
	            self.cp = ProgressiveDimClusteringParameters()
	        else:
	            self.cp = ClusteringParameters()
	        for name, value in kwargs.items():
	            if name == 'gpu':
	                self.gpu = self._resolve_ngpu(value)
	            else:
	                # if this raises an exception, it means that it is a non-existent field
	                getattr(self.cp, name)
	                setattr(self.cp, name, value)
	        self.set_index()

	    @staticmethod
	    def _resolve_ngpu(gpu):
	        """Translate the `gpu` option into the value stored in self.gpu.

	        Only a real boolean True (or -1) means "all GPUs". A plain
	        `gpu == True` test would also match the integer 1 and silently
	        turn "use 1 GPU" into "use all GPUs".
	        """
	        is_true_flag = isinstance(gpu, (bool, np.bool_)) and bool(gpu)
	        if is_true_flag or gpu == -1:
	            return get_num_gpus()
	        return gpu

	    def set_index(self):
	        """Create the object used for assignments during training:
	        self.index for regular clustering, self.fac (an index factory)
	        for progressive-dimension clustering."""
	        d = self.d
	        if self.cp.__class__ == ClusteringParameters:
	            if self.cp.spherical:
	                self.index = IndexFlatIP(d)
	            else:
	                self.index = IndexFlatL2(d)
	            if self.gpu:
	                self.index = faiss.index_cpu_to_all_gpus(self.index, ngpu=self.gpu)
	        else:
	            if self.gpu:
	                fac = GpuProgressiveDimIndexFactory(ngpu=self.gpu)
	            else:
	                fac = ProgressiveDimIndexFactory()
	            self.fac = fac

	    def reset(self, k=None):
	        """ prepare k-means object to perform a new clustering, possibly
	        with another number of centroids """
	        if k is not None:
	            self.k = int(k)
	        self.centroids = None
	        self.obj = None
	        self.iteration_stats = None

	    def train(self, x, weights=None, init_centroids=None):
	        """ Perform k-means clustering.
	        On output of the function call:

	        - the centroids are in the centroids field of size (`k`, `d`).

	        - the objective value at each iteration is in the array obj (size `niter`)

	        - detailed optimization statistics are in the array iteration_stats.

	        Parameters
	        ----------
	        x : array_like
	            Training vectors, shape (n, d), `dtype` must be float32 and n should
	            be larger than the number of clusters `k`.
	        weights : array_like
	            weight associated to each vector, shape `n`
	            (not supported for progressive-dimension clustering)
	        init_centroids : array_like
	            initial set of centroids, shape (nc, d); converted to float32
	            (not supported for progressive-dimension clustering)

	        Returns
	        -------
	        final_obj: float
	            final optimization objective (0.0 if no iteration statistics
	            were recorded)

	        """
	        x = np.ascontiguousarray(x, dtype='float32')
	        n, d = x.shape
	        assert d == self.d

	        if self.cp.__class__ == ClusteringParameters:
	            # regular clustering
	            clus = Clustering(d, self.k, self.cp)
	            if init_centroids is not None:
	                # same coercion as for x, so that float64 arrays or nested
	                # lists can be copied into the float vector
	                init_centroids = np.ascontiguousarray(
	                    init_centroids, dtype='float32')
	                nc, d2 = init_centroids.shape
	                assert d2 == d
	                faiss.copy_array_to_vector(init_centroids.ravel(), clus.centroids)
	            clus.train(x, self.index, weights)
	        else:
	            # not supported for progressive dim
	            assert weights is None
	            assert init_centroids is None
	            assert not self.cp.spherical
	            clus = ProgressiveDimClustering(d, self.k, self.cp)
	            clus.train(n, swig_ptr(x), self.fac)

	        centroids = faiss.vector_float_to_array(clus.centroids)

	        self.centroids = centroids.reshape(self.k, d)
	        stats = clus.iteration_stats
	        stats = [stats.at(i) for i in range(stats.size())]
	        self.obj = np.array([st.obj for st in stats])
	        # copy all the iteration_stats objects to a python array
	        stat_fields = 'obj time time_search imbalance_factor nsplit'.split()
	        self.iteration_stats = [
	            {field: getattr(st, field) for field in stat_fields}
	            for st in stats
	        ]
	        return self.obj[-1] if self.obj.size > 0 else 0.0

	    def assign(self, x):
	        """Assign each vector of x to its nearest centroid.

	        Returns (D, I): the flattened distances and centroid ids from a
	        1-nearest-neighbor search in self.index.
	        NOTE: self.index is only created by set_index() for regular
	        (non progressive-dimension) clustering.
	        """
	        x = np.ascontiguousarray(x, dtype='float32')
	        assert self.centroids is not None, "should train before assigning"
	        self.index.reset()
	        self.index.add(self.centroids)
	        D, I = self.index.search(x, 1)
	        return D.ravel(), I.ravel()


	###########################################
	# Packing and unpacking bitstrings
	###########################################

	def is_sequence(x):
	    """Tell whether x is an instance of collections.abc.Sequence."""
	    sequence_type = collections.abc.Sequence
	    return isinstance(x, sequence_type)

	# keep a handle on the low-level function before its name is rebound below
	pack_bitstrings_c = pack_bitstrings

	def pack_bitstrings(a, nbit):
	    """
	    Pack a set integers (i, j) where i=0:n and j=0:M into
	    n bitstrings.
	    Output is an uint8 array of size (n, code_size), where code_size is
	    such that at most 7 bits per code are wasted.

	    If nbit is an integer: all entries takes nbit bits.
	    If nbit is an array (a Python sequence or a numpy array with at
	    least one dimension): entry (i, j) takes nbit[j] bits.
	    """
	    n, M = a.shape
	    a = np.ascontiguousarray(a, dtype='int32')
	    # numpy arrays are not collections.abc.Sequence instances, so they
	    # have to be recognized explicitly; 0-d arrays stay on the scalar path
	    per_column = is_sequence(nbit) or (
	        isinstance(nbit, np.ndarray) and nbit.ndim > 0)
	    if per_column:
	        nbit = np.ascontiguousarray(nbit, dtype='int32')
	        assert nbit.shape == (M,)
	        code_size = int((nbit.sum() + 7) // 8)
	        b = np.empty((n, code_size), dtype='uint8')
	        pack_bitstrings_c(
	            n, M, swig_ptr(nbit), swig_ptr(a), swig_ptr(b), code_size)
	    else:
	        code_size = (M * nbit + 7) // 8
	        b = np.empty((n, code_size), dtype='uint8')
	        pack_bitstrings_c(n, M, nbit, swig_ptr(a), swig_ptr(b), code_size)
	    return b

	# keep a handle on the low-level function before its name is rebound below
	unpack_bitstrings_c = unpack_bitstrings

	def unpack_bitstrings(b, M_or_nbits, nbit=None):
	    """
	    Unpack a set integers (i, j) where i=0:n and j=0:M from
	    n bitstrings (encoded as uint8s).
	    Input is an uint8 array of size (n, code_size); code_size must be at
	    least the number of bytes needed to hold all the bits of one row.

	    Two forms:
	    - when called with (array, M, nbit): there are M entries of size
	      nbit per row
	    - when called with (array, nbits): element (i, j) is encoded in
	      nbits[j] bits

	    Returns an int32 array of size (n, M).
	    """
	    n, code_size = b.shape

	    if nbit is not None:
	        # fixed-width form: M entries of nbit bits each
	        M = M_or_nbits
	        min_code_size = (M * nbit + 7) // 8
	        assert code_size >= min_code_size
	        a = np.empty((n, M), dtype='int32')
	        unpack_bitstrings_c(
	            n, M, nbit, swig_ptr(b), code_size, swig_ptr(a))
	        return a

	    # variable-width form: one bit count per column
	    nbit = np.ascontiguousarray(M_or_nbits, dtype='int32')
	    M = len(nbit)
	    min_code_size = int((nbit.sum() + 7) // 8)
	    assert code_size >= min_code_size
	    a = np.empty((n, M), dtype='int32')
	    unpack_bitstrings_c(
	        n, M, swig_ptr(nbit),
	        swig_ptr(b), code_size, swig_ptr(a))
	    return a