Python source code examples: sklearn.metrics.pairwise_distances()
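The following examples show how sklearn.metrics.pairwise_distances() is used in open-source projects. pairwise_distances(X, Y=None, metric='euclidean', n_jobs=None, **kwds) returns the distance matrix between the rows of X, or between the rows of X and Y when a second array is given. A minimal sketch of the basic call (toy data, not taken from the examples below):

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.array([[0.0, 0.0], [3.0, 4.0]])
Y = np.array([[0.0, 0.0]])

print(pairwise_distances(X))                      # [[0. 5.] [5. 0.]]
print(pairwise_distances(X, Y))                   # [[0.] [5.]]
print(pairwise_distances(X, metric="manhattan"))  # [[0. 7.] [7. 0.]]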
Example 1
def test_lof_precomputed(random_state=42):
"""Tests LOF with a distance matrix."""
# Note: smaller samples may result in spurious test success
rng = np.random.RandomState(random_state)
X = rng.random_sample((10, 4))
Y = rng.random_sample((3, 4))
DXX = metrics.pairwise_distances(X, metric='euclidean')
DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
# As a feature matrix (n_samples by n_features)
lof_X = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True)
lof_X.fit(X)
pred_X_X = lof_X._predict()
pred_X_Y = lof_X.predict(Y)
# As a dense distance matrix (n_samples by n_samples)
lof_D = neighbors.LocalOutlierFactor(n_neighbors=3, algorithm='brute',
metric='precomputed', novelty=True)
lof_D.fit(DXX)
pred_D_X = lof_D._predict()
pred_D_Y = lof_D.predict(DYX)
assert_array_almost_equal(pred_X_X, pred_D_X)
assert_array_almost_equal(pred_X_Y, pred_D_Y)
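The test excerpts on this page are shown without their module-level imports. Roughly, Example 1 assumes something like the following (sklearn's own test suite imports assert_array_almost_equal from its internal testing utilities; numpy.testing provides the same helper):

import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn import metrics, neighbors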
Example 2
def test_simple_example():
"""Test on a simple example.
    Puts four points in the input space such that points with opposite
    labels are next to each other. After the transform, samples from the
    same class should be next to each other.
"""
X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]])
y = np.array([1, 0, 1, 0])
nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity',
random_state=42)
nca.fit(X, y)
X_t = nca.transform(X)
assert_array_equal(pairwise_distances(X_t).argsort()[:, 1],
np.array([2, 3, 0, 1]))
Example 3
def compute_heterogeneity(data, k, centroids, cluster_assignment):
heterogeneity = 0.0
for i in range(k):
# Select all data points that belong to cluster i. Fill in the blank (RHS only)
member_data_points = data[cluster_assignment == i, :]
if member_data_points.shape[0] > 0: # check if i-th cluster is non-empty
# Compute distances from centroid to data points (RHS only)
distances = pairwise_distances(
member_data_points, [centroids[i]], metric="euclidean"
)
squared_distances = distances ** 2
heterogeneity += np.sum(squared_distances)
return heterogeneity
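A minimal usage sketch for compute_heterogeneity with hand-picked toy values (these inputs are illustrative, not from the original notebook):

import numpy as np
from sklearn.metrics import pairwise_distances

data = np.array([[0.0, 0.0], [0.0, 1.0], [5.0, 5.0]])
centroids = np.array([[0.0, 0.5], [5.0, 5.0]])
cluster_assignment = np.array([0, 0, 1])
# squared distances to the assigned centroids: 0.25 + 0.25 + 0.0
print(compute_heterogeneity(data, 2, centroids, cluster_assignment))  # 0.5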
Example 4
def get_similarities(query_feats, para_features, top=10, combine_feat_scores="mul"):
"""
    Get similarities based on multiple independent queries; the per-query scores are combined using combine_feat_scores
:param query_feats: Multiple vectorized text queries
:param para_features: Multiple vectorized text paragraphs that will be scored against the queries
:param top: Top N facts to keep
:param combine_feat_scores: The way for combining the multiple scores
:return: Ranked fact ids with scores List[tuple(id, weight)]
"""
scores_per_feat = [pairwise_distances(q_feat, para_features, "cosine").ravel() for q_feat in query_feats] # this is distance - low is better!!!
comb_func = comb_funcs[combine_feat_scores]
smoothing_val = 0.000001
max_val = pow((1 + smoothing_val), 2)
dists = scores_per_feat[0] + smoothing_val
if len(scores_per_feat) > 1:
for i in range(1, len(scores_per_feat)):
dists = comb_func(scores_per_feat[i] + smoothing_val, dists)
    sorted_ix = np.argsort(dists).tolist()  # ascending (lowest distance first); ties keep the earlier paragraph
return [[i, (max_val - dists[i]) / max_val] for i in sorted_ix][:top]
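comb_funcs is defined elsewhere in the source module, and cosine distance is 1 - cosine similarity, so lower means more similar. A hedged sketch of how this function might be driven (the TF-IDF setup and the comb_funcs mapping are assumptions, not the original code):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

comb_funcs = {"mul": np.multiply, "add": np.add}  # assumed combiners

vect = TfidfVectorizer()
para_features = vect.fit_transform(["a cat sat", "dogs bark loudly", "cats purr softly"])
query_feats = [vect.transform(["cat"]), vect.transform(["purr"])]
# each query is scored against all paragraphs, then the distance arrays are combined
print(get_similarities(query_feats, para_features, top=2, combine_feat_scores="mul"))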
Example 5
def combine_similarities(scores_per_feat, top=10, combine_feat_scores="mul"):
"""
    Combine per-query similarity scores (precomputed distances) using combine_feat_scores
    :param scores_per_feat: Per-query arrays of distances to the paragraphs (low is better)
:param top: Top N facts to keep
:param combine_feat_scores: The way for combining the multiple scores
:return: Ranked fact ids with scores List[tuple(id, weight)]
"""
# scores_per_feat = [pairwise_distances(q_feat, para_features, "cosine").ravel() for q_feat in query_feats] # this is distance - low is better!!!
comb_func = comb_funcs[combine_feat_scores]
smoothing_val = 0.000001
max_val = pow((1 + smoothing_val), 2)
dists = scores_per_feat[0] + smoothing_val
if len(scores_per_feat) > 1:
for i in range(1, len(scores_per_feat)):
dists = comb_func(scores_per_feat[i] + smoothing_val, dists)
    sorted_ix = np.argsort(dists).tolist()  # ascending (lowest distance first); ties keep the earlier paragraph
max_val = max(np.max(dists), 1)
return [[i, (max_val - dists[i]) / max_val] for i in sorted_ix][:top]
Example 6
def spatial_check(metric):
dist_matrix = pairwise_distances(spatial_data, metric=metric)
# scipy is bad sometimes
if metric == "braycurtis":
dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
if metric in ("cosine", "correlation"):
dist_matrix[np.where(~np.isfinite(dist_matrix))] = 1.0
# And because distance between all zero vectors should be zero
dist_matrix[10, 11] = 0.0
dist_matrix[11, 10] = 0.0
dist_function = dist.named_distances[metric]
test_matrix = np.array(
[
[
dist_function(spatial_data[i], spatial_data[j])
for j in range(spatial_data.shape[0])
]
for i in range(spatial_data.shape[0])
]
)
assert_array_almost_equal(
test_matrix,
dist_matrix,
err_msg="Distances don't match " "for metric {}".format(metric),
)
Example 7
def binary_check(metric):
dist_matrix = pairwise_distances(binary_data, metric=metric)
if metric in ("jaccard", "dice", "sokalsneath", "yule"):
dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
if metric in ("kulsinski", "russellrao"):
dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
# And because distance between all zero vectors should be zero
dist_matrix[10, 11] = 0.0
dist_matrix[11, 10] = 0.0
dist_function = dist.named_distances[metric]
test_matrix = np.array(
[
[
dist_function(binary_data[i], binary_data[j])
for j in range(binary_data.shape[0])
]
for i in range(binary_data.shape[0])
]
)
assert_array_almost_equal(
test_matrix,
dist_matrix,
err_msg="Distances don't match " "for metric {}".format(metric),
)
Example 8
def test_seuclidean():
v = np.abs(np.random.randn(spatial_data.shape[1]))
dist_matrix = pairwise_distances(spatial_data, metric="seuclidean", V=v)
test_matrix = np.array(
[
[
dist.standardised_euclidean(spatial_data[i], spatial_data[j], v)
for j in range(spatial_data.shape[0])
]
for i in range(spatial_data.shape[0])
]
)
assert_array_almost_equal(
test_matrix,
dist_matrix,
err_msg="Distances don't match " "for metric seuclidean",
)
Example 9
def test_mahalanobis():
v = np.cov(np.transpose(spatial_data))
dist_matrix = pairwise_distances(spatial_data, metric="mahalanobis", VI=v)
test_matrix = np.array(
[
[
dist.mahalanobis(spatial_data[i], spatial_data[j], v)
for j in range(spatial_data.shape[0])
]
for i in range(spatial_data.shape[0])
]
)
assert_array_almost_equal(
test_matrix,
dist_matrix,
err_msg="Distances don't match " "for metric mahalanobis",
)
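A caveat worth noting: scikit-learn forwards VI to scipy, which documents it as the inverse of the covariance matrix. Because the test passes the same matrix v to both implementations, the comparison stays self-consistent, but for the textbook Mahalanobis distance you would invert the covariance first, roughly:

VI = np.linalg.inv(np.cov(spatial_data.T))
dist_matrix = pairwise_distances(spatial_data, metric="mahalanobis", VI=VI)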
Example 10
def sparse_spatial_check(metric, sparse_spatial_data):
# Check that metric is supported for this test, otherwise, fail!
assert (
metric in spdist.sparse_named_distances
), f"{metric} not supported for sparse data"
dist_matrix = pairwise_distances(sparse_spatial_data.todense(), metric=metric)
if metric in ("braycurtis", "dice", "sokalsneath", "yule"):
dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
if metric in ("cosine", "correlation", "kulsinski", "russellrao"):
dist_matrix[np.where(~np.isfinite(dist_matrix))] = 1.0
# And because distance between all zero vectors should be zero
dist_matrix[10, 11] = 0.0
dist_matrix[11, 10] = 0.0
run_test_sparse_metric(metric, sparse_spatial_data, dist_matrix)
Example 11
def sparse_binary_check(metric, sparse_binary_data):
# Check that metric is supported for this test, otherwise, fail!
assert (
metric in spdist.sparse_named_distances
), f"{metric} not supported for sparse data"
dist_matrix = pairwise_distances(sparse_binary_data.todense(), metric=metric)
if metric in ("jaccard", "dice", "sokalsneath", "yule"):
dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
if metric in ("kulsinski", "russellrao"):
dist_matrix[np.where(~np.isfinite(dist_matrix))] = 1.0
# And because distance between all zero vectors should be zero
dist_matrix[10, 11] = 0.0
dist_matrix[11, 10] = 0.0
run_test_sparse_metric(metric, sparse_binary_data, dist_matrix)
Example 12
def test_weighted_minkowski(spatial_data):
v = np.abs(np.random.randn(spatial_data.shape[1]))
dist_matrix = pairwise_distances(spatial_data, metric="wminkowski", w=v, p=3)
test_matrix = np.array(
[
[
dist.weighted_minkowski(spatial_data[i], spatial_data[j], v, p=3)
for j in range(spatial_data.shape[0])
]
for i in range(spatial_data.shape[0])
]
)
assert_array_almost_equal(
test_matrix,
dist_matrix,
err_msg="Distances don't match " "for metric weighted_minkowski",
)
Example 13
def test_mahalanobis(spatial_data):
v = np.cov(np.transpose(spatial_data))
dist_matrix = pairwise_distances(spatial_data, metric="mahalanobis", VI=v)
test_matrix = np.array(
[
[
dist.mahalanobis(spatial_data[i], spatial_data[j], v)
for j in range(spatial_data.shape[0])
]
for i in range(spatial_data.shape[0])
]
)
assert_array_almost_equal(
test_matrix,
dist_matrix,
err_msg="Distances don't match " "for metric mahalanobis",
)
Example 14
def gaussian(x, workers=None):
"""Default medial gaussian kernel similarity calculation"""
l1 = pairwise_distances(X=x, metric="l1", n_jobs=workers)
n = l1.shape[0]
med = np.median(
np.lib.stride_tricks.as_strided(
l1, (n - 1, n + 1), (l1.itemsize * (n + 1), l1.itemsize)
)[:, 1:]
)
# prevents division by zero when used on label vectors
med = med if med else 1
gamma = 1.0 / (2 * (med ** 2))
return rbf_kernel(x, gamma=gamma)
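The as_strided call is a compact trick: viewing the flattened n x n matrix as shape (n - 1, n + 1) puts the diagonal in column 0, so [:, 1:] contains exactly the off-diagonal entries, and med is the median pairwise L1 distance (the classic median heuristic for the RBF bandwidth). For a symmetric distance matrix, a plainer equivalent would be:

med = np.median(l1[np.triu_indices_from(l1, k=1)])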
Example 15
def _compute_isc(data, metric='median'):
''' Helper function to compute intersubject correlation from observations by subjects array.
Args:
data: (pd.DataFrame, np.array) observations by subjects where isc is computed across subjects
        metric: (str) type of summary statistic across subject pairs ['mean','median']
Returns:
isc: (float) intersubject correlation coefficient
'''
from nltools.data import Adjacency
similarity = Adjacency(1 - pairwise_distances(data.T, metric='correlation'), matrix_type='similarity')
    if metric == 'mean':
        isc = np.tanh(similarity.r_to_z().mean())
    elif metric == 'median':
        isc = similarity.median()
return isc
Example 16
def test_lof_precomputed(algorithm, random_state=42):
"""Tests LOF with a distance matrix."""
# Note: smaller samples may result in spurious test success
local_rng = np.random.RandomState(random_state)
X = local_rng.random_sample((10, 4))
Y = local_rng.random_sample((3, 4))
DXX = metrics.pairwise_distances(X, metric='euclidean')
DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
# As a feature matrix (n_samples by n_features)
lof_X = neighbors.LocalOutlierFactor(n_neighbors=3,
algorithm=algorithm,
novelty=True)
lof_X.fit(X)
pred_X_X = lof_X._predict()
pred_X_Y = lof_X.predict(Y)
# As a dense distance matrix (n_samples by n_samples)
lof_D = neighbors.LocalOutlierFactor(n_neighbors=3, algorithm='brute',
metric='precomputed', novelty=True)
lof_D.fit(DXX)
pred_D_X = lof_D._predict()
pred_D_Y = lof_D.predict(DYX)
assert_array_almost_equal(pred_X_X, pred_D_X)
assert_array_almost_equal(pred_X_Y, pred_D_Y)
Example 17
def test_simple_example():
"""Test on a simple example.
    Puts four points in the input space such that points with opposite
    labels are next to each other. After the transform, samples from the
    same class should be next to each other.
"""
X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]])
y = np.array([1, 0, 1, 0])
nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity',
random_state=42)
nca.fit(X, y)
X_t = nca.transform(X)
assert_array_equal(pairwise_distances(X_t).argsort()[:, 1],
np.array([2, 3, 0, 1]))
Example 18
def test_precomputed_cross_validation():
# Ensure array is split correctly
rng = np.random.RandomState(0)
X = rng.rand(20, 2)
D = pairwise_distances(X, metric='euclidean')
y = rng.randint(3, size=20)
for Est in (neighbors.KNeighborsClassifier,
neighbors.RadiusNeighborsClassifier,
neighbors.KNeighborsRegressor,
neighbors.RadiusNeighborsRegressor):
metric_score = cross_val_score(Est(algorithm_params={'n_candidates': 5}), X, y)
precomp_score = cross_val_score(Est(metric='precomputed',
algorithm_params={'n_candidates': 5},
),
D, y)
assert_array_equal(metric_score, precomp_score)
Example 19
def pairwise_distances(
X: ArrayLike,
Y: ArrayLike,
metric: Union[str, Callable[[ArrayLike, ArrayLike], float]] = "euclidean",
n_jobs: Optional[int] = None,
**kwargs: Any
):
if isinstance(Y, da.Array):
raise TypeError("`Y` must be a numpy array")
chunks = (X.chunks[0], (len(Y),))
return X.map_blocks(
metrics.pairwise_distances,
Y,
dtype=float,
chunks=chunks,
metric=metric,
**kwargs
)
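This is a blockwise wrapper in the dask-ml style: each chunk of rows of X is compared against the full in-memory Y, producing a lazy dask array. A rough usage sketch (shapes and chunking are illustrative):

import dask.array as da
import numpy as np

X = da.random.random((1000, 16), chunks=(250, 16))
Y = np.random.random((5, 16))
D = pairwise_distances(X, Y, metric="cosine")
print(D.shape)           # (1000, 5), computed lazily
print(D[:2].compute())   # materialize the first rows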
Example 20
def update_distances(self, cluster_centers, only_new=True, reset_dist=False):
"""Update min distances given cluster centers.
Args:
cluster_centers: indices of cluster centers
only_new: only calculate distance for newly selected points and update
min_distances.
        reset_dist: whether to reset min_distances.
"""
if reset_dist:
self.min_distances = None
if only_new:
cluster_centers = [d for d in cluster_centers
if d not in self.already_selected]
if cluster_centers:
# Update min_distances for all examples given new cluster center.
x = self.features[cluster_centers]
dist = pairwise_distances(self.features, x, metric=self.metric)
if self.min_distances is None:
self.min_distances = np.min(dist, axis=1).reshape(-1,1)
else:
self.min_distances = np.minimum(self.min_distances, dist)
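update_distances is the workhorse of a greedy k-center (core-set) sampler. A hedged sketch of the selection loop that might sit in the same class, written against the attributes used above (this method is an assumption, not the original code; assumes numpy as np):

def select_batch_(self, already_selected, N):
    # Sketch: repeatedly pick the point farthest from all chosen centers.
    self.already_selected = list(already_selected)
    self.update_distances(already_selected, only_new=False, reset_dist=True)
    new_batch = []
    for _ in range(N):
        if self.min_distances is None:
            ind = np.random.randint(self.features.shape[0])  # cold start
        else:
            ind = int(np.argmax(self.min_distances))
        self.update_distances([ind], only_new=True, reset_dist=False)
        new_batch.append(ind)
        self.already_selected.append(ind)
    return new_batch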
Example 21
def _max_representative_samples(self, image_features, candidate_image_features, selection_count):
all_distances = pairwise_distances(image_features, candidate_image_features, metric='euclidean')
selected_sample_indices = []
print('Finding max representative candidates..')
minimum_distances = np.ones((len(image_features))) * float('inf')
for _ in tqdm(range(selection_count)):
current_best_score = float("-inf")
current_best_idx = None
current_minimum_distances = None
for i in range(len(candidate_image_features)):
if i not in selected_sample_indices:
selected_sample_indices.append(i)
tmp_distances = np.minimum(minimum_distances, all_distances[:, i])
tmp_score = np.sum(tmp_distances) * -1
if tmp_score > current_best_score:
current_best_score = tmp_score
current_minimum_distances = tmp_distances
current_best_idx = i
selected_sample_indices.pop()
selected_sample_indices.append(current_best_idx)
minimum_distances = current_minimum_distances
return selected_sample_indices
Example 22
def assign_clusters(data, centroids):
# Compute distances between each data point and the set of centroids:
# Fill in the blank (RHS only)
distances_from_centroids = pairwise_distances(data, centroids, metric='euclidean')
# Compute cluster assignments for each data point:
# Fill in the blank (RHS only)
cluster_assignment = np.argmin(distances_from_centroids, axis=1)
return cluster_assignment
# **Checkpoint**. For the last time, let us check if Step 1 was implemented correctly. With rows 0, 2, 4, and 6 of `tf_idf` as an initial set of centroids, we assign cluster labels to rows 0, 10, 20, ..., and 90 of `tf_idf`. The resulting cluster labels should be `[0, 1, 1, 0, 0, 2, 0, 2, 2, 1]`.
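A sketch of the checkpoint described above (tf_idf is the notebook's document-term matrix, so this is illustrative rather than self-contained):

initial_centroids = tf_idf[[0, 2, 4, 6], :].toarray()
cluster_labels = assign_clusters(tf_idf[np.arange(0, 100, 10), :], initial_centroids)
print(cluster_labels)  # expected: [0 1 1 0 0 2 0 2 2 1]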
Example 23
def compute_heterogeneity(data, k, centroids, cluster_assignment):
heterogeneity = 0.0
    for i in range(k):
        # Select all data points that belong to cluster i. Fill in the blank (RHS only)
        member_data_points = data[cluster_assignment == i, :]
        if member_data_points.shape[0] > 0:  # check if i-th cluster is non-empty
            # Compute distances from centroid to data points (RHS only)
            distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
            squared_distances = distances ** 2
            heterogeneity += np.sum(squared_distances)
return heterogeneity
# Let's compute the cluster heterogeneity for the 2-cluster example we've been considering based on our current cluster assignments and centroids.
Example 24
def computer_perf(self, instances):
X = instances.features.get_values()
labels = instances.ground_truth.get_labels()
# For unsupervised projection methods,
# the performance is always computed with the labels
# (not the families).
if hasattr(self.projection.conf, 'multiclass'):
if self.projection.conf.multiclass:
labels = instances.ground_truth.get_families()
unique_labels, label_inds = np.unique(labels, return_inverse=True)
ratio = 0
for li in range(len(unique_labels)):
Xc = X[label_inds == li]
Xnc = X[label_inds != li]
ratio += pairwise_distances(Xc).mean() / \
pairwise_distances(Xc, Xnc).mean()
self.class_separation = ratio / len(unique_labels)
Example 25
def compute_distances_to_neighborhood(self, to_explain_X, samples_X):
distances = pairwise_distances(to_explain_X, samples_X, metric='euclidean')
    # distance to self will always appear in the first column
    distances[:, 0] = np.zeros(distances.shape[0])
return distances
Example 26
def _main(args):
with open(args.results_file, 'rb') as f:
results = pickle.load(f)
features, preds, labels, filenames = results
distances = pairwise_distances(features, metric='cosine', n_jobs=-1)
filenames = np.array([_get_only_filename(item) for item in filenames])
avg_top_one = []
results = []
for i, row in enumerate(distances):
sorted_similarity = np.argsort(row)[1:]
gt = labels[i]
top_one = labels[sorted_similarity[0]]
        avg_top_one.append(int(top_one == gt))  # 1 if the nearest neighbour shares the query's label
tmp = []
tmp.append(filenames[i])
query_results = filenames[sorted_similarity[:args.num_results]]
_ = [tmp.append(item) for item in query_results]
results.append(tmp)
tmp = []
tmp.append('Query')
    if args.num_results is None:
args.num_results = len(features) - 1
_ = [tmp.append('R{}'.format(i + 1)) for i in range(args.num_results)]
results = sorted(results, key=lambda e:e[0])
dframe = DataFrame(results, columns=tmp)
dframe.to_csv(args.output_file, index=False)
print('Precision@1: {}'.format(np.average(avg_top_one)))
return
Example 27
def _calc_mean_pd(data, f):
return (pairwise_distances(data[f], metric='cosine').sum() / 2) / (f.sum() ** 2 - f.sum())
Example 28
def test_silhouette():
# Tests the Silhouette Coefficient.
dataset = datasets.load_iris()
X_dense = dataset.data
X_csr = csr_matrix(X_dense)
X_dok = sp.dok_matrix(X_dense)
X_lil = sp.lil_matrix(X_dense)
y = dataset.target
for X in [X_dense, X_csr, X_dok, X_lil]:
D = pairwise_distances(X, metric='euclidean')
# Given that the actual labels are used, we can assume that S would be
# positive.
score_precomputed = silhouette_score(D, y, metric='precomputed')
assert_greater(score_precomputed, 0)
# Test without calculating D
score_euclidean = silhouette_score(X, y, metric='euclidean')
        assert score_precomputed == pytest.approx(score_euclidean)
if X is X_dense:
score_dense_without_sampling = score_precomputed
else:
            assert score_euclidean == pytest.approx(score_dense_without_sampling)
# Test with sampling
score_precomputed = silhouette_score(D, y, metric='precomputed',
sample_size=int(X.shape[0] / 2),
random_state=0)
score_euclidean = silhouette_score(X, y, metric='euclidean',
sample_size=int(X.shape[0] / 2),
random_state=0)
assert_greater(score_precomputed, 0)
assert_greater(score_euclidean, 0)
        assert score_euclidean == pytest.approx(score_precomputed)
if X is X_dense:
score_dense_with_sampling = score_precomputed
else:
            assert score_euclidean == pytest.approx(score_dense_with_sampling)
Example 29
def test_spectral_unknown_mode():
# Test that SpectralClustering fails with an unknown mode set.
centers = np.array([
[0., 0., 0.],
[10., 10., 10.],
[20., 20., 20.],
])
X, true_labels = make_blobs(n_samples=100, centers=centers,
cluster_std=1., random_state=42)
D = pairwise_distances(X) # Distance matrix
S = np.max(D) - D # Similarity matrix
S = sparse.coo_matrix(S)
assert_raises(ValueError, spectral_clustering, S, n_clusters=2,
random_state=0, eigen_solver="<unknown>")
Example 30
def test_spectral_unknown_assign_labels():
# Test that SpectralClustering fails with an unknown assign_labels set.
centers = np.array([
[0., 0., 0.],
[10., 10., 10.],
[20., 20., 20.],
])
X, true_labels = make_blobs(n_samples=100, centers=centers,
cluster_std=1., random_state=42)
D = pairwise_distances(X) # Distance matrix
S = np.max(D) - D # Similarity matrix
S = sparse.coo_matrix(S)
assert_raises(ValueError, spectral_clustering, S, n_clusters=2,
random_state=0, assign_labels="<unknown>")
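Both tests turn the distance matrix into a similarity matrix with S = np.max(D) - D, since spectral clustering expects affinities (higher means more similar). The estimator form of the same pattern, as a sketch:

from sklearn.cluster import SpectralClustering

labels = SpectralClustering(n_clusters=3, affinity="precomputed",
                            random_state=0).fit_predict(S)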