
def multi_class_classification(data_X, data_Y):
    """Cross-validate a linear SVM and print multi-class evaluation metrics.

    A linear ``svm.SVC`` (C=1) is fitted on the full data, out-of-fold
    predictions are produced with 2-fold ``cross_val_predict``, and a set of
    ``sklearn.metrics`` scores comparing ``data_Y`` with those predictions is
    printed to stdout.  The function returns nothing.

    :param data_X: feature matrix handed to the estimator
    :param data_Y: target labels, one per row of ``data_X``
    """
    svc = svm.SVC(C=1, kernel='linear')
    # NOTE(review): the right-hand side of this assignment was truncated in
    # the source ("clf =, data_Y)"); fitting ``svc`` is the presumed original.
    clf = svc.fit(data_X, data_Y)
    predicted = cross_val_predict(clf, data_X, data_Y, cv=2)

    # Single-argument print() calls emit the same text on Python 2 and 3.
    print("accuracy " + str(metrics.accuracy_score(data_Y, predicted)))
    print("f1 score macro " + str(metrics.f1_score(data_Y, predicted, average='macro')))
    print("f1 score micro " + str(metrics.f1_score(data_Y, predicted, average='micro')))
    print("precision score " + str(metrics.precision_score(data_Y, predicted, average='macro')))
    print("recall score " + str(metrics.recall_score(data_Y, predicted, average='macro')))
    print("hamming_loss " + str(metrics.hamming_loss(data_Y, predicted)))
    print("classification_report " + str(metrics.classification_report(data_Y, predicted)))
    # NOTE(review): jaccard_similarity_score is not available in recent
    # scikit-learn releases -- confirm the pinned version before upgrading.
    print("jaccard_similarity_score " + str(metrics.jaccard_similarity_score(data_Y, predicted)))
    print("zero_one_loss " + str(metrics.zero_one_loss(data_Y, predicted)))
    # log_loss, roc_auc_score and matthews_corrcoef were commented out in the
    # original and remain disabled here.
def evaluation_analysis(true_label, predicted):
    """Print a battery of ``sklearn.metrics`` scores for a set of predictions.

    Despite the original wording ("return all metrics results") the function
    only prints; it returns nothing.

    :param true_label: ground-truth labels
    :param predicted: predicted labels, aligned with ``true_label``
    """
    # Single-argument print() calls emit the same text on Python 2 and 3.
    print("accuracy " + str(metrics.accuracy_score(true_label, predicted)))
    print("f1 score macro " + str(metrics.f1_score(true_label, predicted, average='macro')))
    print("f1 score micro " + str(metrics.f1_score(true_label, predicted, average='micro')))
    print("precision score " + str(metrics.precision_score(true_label, predicted, average='macro')))
    print("recall score " + str(metrics.recall_score(true_label, predicted, average='macro')))
    print("hamming_loss " + str(metrics.hamming_loss(true_label, predicted)))
    print("classification_report " + str(metrics.classification_report(true_label, predicted)))
    # NOTE(review): jaccard_similarity_score is not available in recent
    # scikit-learn releases -- confirm the pinned version before upgrading.
    print("jaccard_similarity_score " + str(metrics.jaccard_similarity_score(true_label, predicted)))
    # NOTE(review): log_loss and roc_auc_score receive ``predicted`` as-is;
    # presumably callers pass scores/probabilities here -- confirm.
    print("log_loss " + str(metrics.log_loss(true_label, predicted)))
    print("zero_one_loss " + str(metrics.zero_one_loss(true_label, predicted)))
    print("AUC&ROC " + str(metrics.roc_auc_score(true_label, predicted)))
    print("matthews_corrcoef " + str(metrics.matthews_corrcoef(true_label, predicted)))
def classification_scores(gts, preds, labels):
    """Compute overall, averaged and per-class classification scores.

    :param gts: ground-truth labels
    :param preds: predicted labels, aligned with ``gts``
    :param labels: class labels; used for the per-class accuracies and as the
        row/column order of the confusion matrix
    :return: tuple ``(accuracy, f1_micro, precision_micro, recall_micro,
        f1_macro, precision_macro, recall_macro, confusion,
        class_accuracies, f1s, precisions, recalls)``
    """
    accuracy = metrics.accuracy_score(gts, preds)

    # Boolean-mask indexing only works on arrays; with plain lists
    # ``gts == lab`` collapses to a single bool and indexes element 0/1.
    gts_arr = np.asarray(gts)
    preds_arr = np.asarray(preds)
    # Accuracy restricted to the samples whose ground truth is ``lab``.
    # TODO(review): the original author marked this computation "TODO Fix".
    class_accuracies = np.array([
        metrics.accuracy_score(gts_arr[gts_arr == lab], preds_arr[gts_arr == lab])
        for lab in labels
    ])

    f1_micro = metrics.f1_score(gts, preds, average='micro')
    precision_micro = metrics.precision_score(gts, preds, average='micro')
    recall_micro = metrics.recall_score(gts, preds, average='micro')
    f1_macro = metrics.f1_score(gts, preds, average='macro')
    precision_macro = metrics.precision_score(gts, preds, average='macro')
    recall_macro = metrics.recall_score(gts, preds, average='macro')

    # Class-wise scores (one entry per class, no averaging).
    f1s = metrics.f1_score(gts, preds, average=None)
    precisions = metrics.precision_score(gts, preds, average=None)
    recalls = metrics.recall_score(gts, preds, average=None)

    confusion = metrics.confusion_matrix(gts, preds, labels=labels)

    return (accuracy, f1_micro, precision_micro, recall_micro, f1_macro,
            precision_macro, recall_macro, confusion, class_accuracies,
            f1s, precisions, recalls)
def get_all_metrics(model, eval_data, eval_labels, pred_labels):
    """Print and return ROC/AUC, model accuracy and precision/recall scores.

    ``model.evaluate(eval_data, eval_labels, verbose=0)`` is called and the
    element at index 1 of its result is reported as the test accuracy.

    :return: ``(auc, accuracy, precision, recall, f1, average_precision,
        fpr, tpr)``
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(eval_labels, pred_labels)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    print("auc_keras:" + str(roc_auc))

    test_accuracy = model.evaluate(eval_data, eval_labels, verbose=0)[1]
    print("Test accuracy: " + str(test_accuracy))

    # Each scorer is evaluated and reported in turn, in the original order.
    reported = []
    for template, scorer in (
        ('Precision score: {0:0.2f}', precision_score),
        ('Recall score: {0:0.2f}', recall_score),
        ('F1 score: {0:0.2f}', f1_score),
        ('Average precision-recall score: {0:0.2f}', average_precision_score),
    ):
        value = scorer(eval_labels, pred_labels)
        print(template.format(value))
        reported.append(value)
    precision, recall, f1, average_precision = reported

    return (roc_auc, test_accuracy, precision, recall, f1,
            average_precision, false_pos_rate, true_pos_rate)
def get_all_metrics_(eval_labels, pred_labels):
    """Print and return ROC/AUC and precision/recall scores (no model needed).

    :return: ``(auc, precision, recall, f1, average_precision, fpr, tpr)``
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(eval_labels, pred_labels)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    print("auc_keras:" + str(roc_auc))

    # Each scorer is evaluated and reported in turn, in the original order.
    reported = []
    for template, scorer in (
        ('Precision score: {0:0.2f}', precision_score),
        ('Recall score: {0:0.2f}', recall_score),
        ('F1 score: {0:0.2f}', f1_score),
        ('Average precision-recall score: {0:0.2f}', average_precision_score),
    ):
        value = scorer(eval_labels, pred_labels)
        print(template.format(value))
        reported.append(value)
    precision, recall, f1, average_precision = reported

    return (roc_auc, precision, recall, f1, average_precision,
            false_pos_rate, true_pos_rate)
def run_evaluate(self, test):
        """Evaluates performance on test set

        Args:
            test: dataset that yields tuple of (sentences, relation tags)

        Returns:
            metrics: (dict) with keys "acc", "p", "r" and "f1"; precision,
                recall and F1 are macro-averaged.

        """
        y_true, y_pred = [], []
        for data in minibatches(test, self.config.batch_size):
            word_batch, pos1_batch, pos2_batch, pos_batch, y_batch = data
            relations_pred = self.predict_batch(word_batch, pos1_batch, pos2_batch, pos_batch)
            assert len(relations_pred) == len(y_batch)
            y_true.extend(y_batch)
            y_pred.extend(relations_pred.tolist())

        acc = accuracy_score(y_true, y_pred)
        p = precision_score(y_true, y_pred, average='macro')
        r = recall_score(y_true, y_pred, average='macro')
        f1 = f1_score(y_true, y_pred, average='macro')

        return {"acc": acc, "p": p, "r": r, "f1": f1}
def evaluate(trueValues, predicted, decimals, note):
    """Print accuracy, weighted precision/recall/F1 and the F1 of two classes.

    :param trueValues: ground-truth labels
    :param predicted: predicted labels, aligned with ``trueValues``
    :param decimals: number of decimals each printed value is rounded to
    :param note: heading printed before the scores

    The per-class F1 values are taken from positions 0 and 1 of the
    un-averaged ``f1_score`` result, so at least two classes must be present.
    """
    print(note)
    label = 1
    avg = 'weighted'
    a = accuracy_score(trueValues, predicted)
    p = precision_score(trueValues, predicted, pos_label=label, average=avg)
    r = recall_score(trueValues, predicted, pos_label=label, average=avg)
    avg_f1 = f1_score(trueValues, predicted, pos_label=label, average=avg)
    fclasses = f1_score(trueValues, predicted, average=None)
    f1c1, f1c2 = fclasses[0], fclasses[1]
    # Unweighted mean of the two per-class F1 scores.
    fw = (f1c1 + f1c2) / 2.0

    # Single-argument print() works on Python 2 and 3.  The Python 2 print
    # statement wrote no separating space after a tab, hence plain "+".
    print('accuracy:\t' + str(round(a, decimals)))
    print('precision:\t' + str(round(p, decimals)))
    print('recall:\t' + str(round(r, decimals)))
    print('avg f1:\t' + str(round(avg_f1, decimals)))
    print('c1 f1:\t' + str(round(f1c1, decimals)))
    print('c2 f1:\t' + str(round(f1c2, decimals)))
    print('avg(c1,c2):\t' + str(round(fw, decimals)))
    print('------------')


# split a parallel or comparable corpus into two parts 
def accuracy(y_true, y_pred):
    """Compute multi-class scores from per-class score rows.

    Each row of ``y_true`` / ``y_pred`` is reduced to a class index with
    ``argmax`` before scoring.

    :param y_true: 2-D array-like, one row of per-class values per sample
    :param y_pred: 2-D array-like with at least as many rows as ``y_true``
    :return: ``(Sens, Prec, F1, cnf_mat)`` -- macro recall, macro precision,
        weighted F1 and the confusion matrix
    """
    n_samples = len(y_true)
    # Only the first len(y_true) prediction rows are scored.
    y = np.argmax(np.asarray(y_true), axis=1)
    y_ = np.argmax(np.asarray(y_pred)[:n_samples], axis=1)

    # Confusion matrix.
    cnf_mat = confusion_matrix(y, y_)

    # Multi-class evaluation scores.
    Sens = recall_score(y, y_, average='macro')
    Prec = precision_score(y, y_, average='macro')
    F1 = f1_score(y, y_, average='weighted')
    return Sens, Prec, F1, cnf_mat
def test_precision_recall_f_ignored_labels():
    # Test a subset of labels may be requested for PRF
    y_true = [1, 1, 2, 3]
    y_pred = [1, 3, 3, 3]
    y_true_bin = label_binarize(y_true, classes=np.arange(5))
    y_pred_bin = label_binarize(y_pred, classes=np.arange(5))
    # The same expectations are checked for multiclass and indicator input.
    data = [(y_true, y_pred),
            (y_true_bin, y_pred_bin)]

    for y_true, y_pred in data:
        recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3])
        recall_all = partial(recall_score, y_true, y_pred, labels=None)

        assert_array_almost_equal([.5, 1.], recall_13(average=None))
        assert_almost_equal((.5 + 1.) / 2, recall_13(average='macro'))
        # NOTE(review): the second argument of this call was truncated in the
        # source; the support-weighted expectation implies average='weighted'.
        assert_almost_equal((.5 * 2 + 1. * 1) / 3,
                            recall_13(average='weighted'))
        assert_almost_equal(2. / 3, recall_13(average='micro'))

        # ensure the above were meaningful tests:
        # NOTE(review): the loop body was missing in the source; comparing
        # against the all-labels recall is the presumed original check.
        for average in ['macro', 'weighted', 'micro']:
            assert recall_13(average=average) != recall_all(average=average)
def test_recall_warnings():
    # NOTE(review): several statements of this test were lost in the source
    # (only argument fragments survived).  The structure below is a
    # reconstruction using the stdlib ``warnings`` module; average='micro' is
    # presumed because the inputs are label-indicator matrices -- confirm
    # against the upstream test.
    expected = ('Recall is ill-defined and '
                'being set to 0.0 due to no true samples.')

    # True samples exist, so no "ill-defined" warning is expected.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter('always')
        recall_score(np.array([[1, 1], [1, 1]]),
                     np.array([[0, 0], [0, 0]]),
                     average='micro')
        assert not [w for w in record if 'ill-defined' in str(w.message)]

    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter('always')
        # No true samples at all: recall is undefined and must warn.
        recall_score(np.array([[0, 0], [0, 0]]),
                     np.array([[1, 1], [1, 1]]),
                     average='micro')
        assert str(record.pop().message) == expected
        recall_score([0, 0], [0, 0])
        assert str(record.pop().message) == expected
def test_prf_average_binary_data_non_binary():
    # Error if user does not explicitly set non-binary average mode
    y_true_mc = [1, 2, 3, 3]
    y_pred_mc = [1, 2, 3, 1]
    msg_mc = ("Target is multiclass but average='binary'. Please "
              "choose another average setting, one of ["
              "None, 'micro', 'macro', 'weighted'].")
    y_true_ind = np.array([[0, 1, 1], [1, 0, 0], [0, 0, 1]])
    y_pred_ind = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])
    msg_ind = ("Target is multilabel-indicator but average='binary'. Please "
               "choose another average setting, one of ["
               "None, 'micro', 'macro', 'weighted', 'samples'].")

    # Every metric must reject the default average for both target types.
    for y_true, y_pred, msg in [
        (y_true_mc, y_pred_mc, msg_mc),
        (y_true_ind, y_pred_ind, msg_ind),
    ]:
        for metric in [precision_score, recall_score, f1_score,
                       partial(fbeta_score, beta=2)]:
            assert_raise_message(ValueError, msg,
                                 metric, y_true, y_pred)
def test_grid_search_cv_results_multimetric():
    # Multi-metric cv_results_ must agree with the single-metric searches.
    X, y = make_classification(n_samples=50, n_features=4, random_state=42)

    n_splits = 3
    params = [dict(kernel=['rbf', ], C=[1, 10], gamma=[0.1, 1]),
              dict(kernel=['poly', ], degree=[1, 2])]

    for iid in (False, True):
        grid_searches = []
        # One multi-metric search followed by the two single-metric ones.
        for scoring in ({'accuracy': make_scorer(accuracy_score),
                         'recall': make_scorer(recall_score)},
                        'accuracy', 'recall'):
            grid_search = GridSearchCV(SVC(gamma='scale'), cv=n_splits,
                                       iid=iid, param_grid=params,
                                       scoring=scoring, refit=False)
            # NOTE(review): the call was truncated in the source ("  , y)").
            grid_search.fit(X, y)
            assert_equal(grid_search.iid, iid)
            # Without this the comparison below receives no searches at all.
            grid_searches.append(grid_search)

        compare_cv_results_multimetric_with_single(*grid_searches, iid=iid)
def test_ovr_multilabel_dataset():
    # One-vs-rest on a generated multilabel problem must reach the expected
    # micro-averaged precision and recall, with and without unlabeled rows.
    base_clf = MultinomialNB(alpha=1)
    for au, prec, recall in zip((True, False), (0.51, 0.66), (0.51, 0.80)):
        # NOTE(review): every argument after n_samples was truncated in the
        # source.  The values below are a reconstruction and the expected
        # scores depend on them -- verify against the upstream test.
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=2,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        assert clf.multilabel_
        # NOTE(review): the expected value and tolerance were truncated;
        # two decimals matches the precision of the constants above.
        assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"),
                            prec,
                            decimal=2)
        assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"),
                            recall,
                            decimal=2)
def get_score(self, model, texta, textb, labels, score_type='f1'):
        """Score ``model`` on one batch of text pairs.

        :param model: callable invoked as ``model(texta, textb)``; its output
            is soft-maxed over dim 1 and the arg-max is the predicted class
        :param texta: first text tensor
        :param textb: second text tensor
        :param labels: gold label tensor
        :param score_type: one of 'f1', 'p', 'r', 'acc'; anything else falls
            back to 'f1'
        :return: the micro-averaged score (plain accuracy for 'acc')
        """
        metrics_map = {
            'f1': f1_score,
            'p': precision_score,
            'r': recall_score,
            'acc': accuracy_score,
        }
        metric_func = metrics_map.get(score_type, f1_score)
        assert texta.size(1) == textb.size(1) == len(labels)
        vec_predict = model(texta, textb)
        soft_predict = torch.softmax(vec_predict, dim=1)
        _, predict_index = torch.max(soft_predict.cpu().data, dim=1)
        labels = labels.view(-1).cpu().data.numpy()
        if metric_func is accuracy_score:
            # accuracy_score() does not accept an ``average`` keyword.
            return metric_func(labels, predict_index)
        # Arguments follow the (y_true, y_pred) convention; for 1-D
        # single-label input the micro average does not depend on the order.
        return metric_func(labels, predict_index, average='micro')
def get_score(self, model, texta, textb, labels, score_type='f1'):
        """Score ``model`` on one batch of text pairs using a 0.5 threshold.

        :param model: callable invoked as ``model(texta, textb)``; its output
            is thresholded at 0.5 to obtain predicted labels
        :param texta: first text tensor
        :param textb: second text tensor
        :param labels: gold label tensor
        :param score_type: one of 'f1', 'p', 'r', 'acc'; anything else falls
            back to 'f1'
        :return: the micro-averaged score (plain accuracy for 'acc')
        """
        metrics_map = {
            'f1': f1_score,
            'p': precision_score,
            'r': recall_score,
            'acc': accuracy_score,
        }
        metric_func = metrics_map.get(score_type, f1_score)
        assert texta.size(1) == textb.size(1) == len(labels)
        predict_prob = model(texta, textb)
        # NOTE(review): the call was truncated in the source
        # ("predict_labels =, 0.5)"); torch.ge (>= 0.5) is the presumed
        # original -- confirm whether the comparison should be strict.
        predict_labels = torch.ge(predict_prob, 0.5)
        predict_labels = predict_labels.view(-1).cpu().data.numpy()
        labels = labels.view(-1).cpu().data.numpy()
        if metric_func is accuracy_score:
            # accuracy_score() does not accept an ``average`` keyword.
            return metric_func(labels, predict_labels)
        # Arguments follow the (y_true, y_pred) convention; for 1-D
        # single-label input the micro average does not depend on the order.
        return metric_func(labels, predict_labels, average='micro')
def get_score(self, model, x, y, pos, rel, field_x, field_y, field_pos, score_type='f1'):
        """Score ``model`` on a single tagged sequence.

        :param model: callable taking the encoded tokens, POS tags, relation
            flags and the sequence length
        :param x: tokens, mapped to ids through ``field_x.stoi``
        :param y: gold tags, mapped to ids through ``field_y.stoi``
        :param pos: POS tags, mapped to ids through ``field_pos.stoi``
        :param rel: per-token values convertible with ``int``
        :param score_type: one of 'f1', 'p', 'r', 'acc'; anything else falls
            back to 'f1'
        :return: the micro-averaged score (plain accuracy for 'acc')
        """
        metrics_map = {
            'f1': f1_score,
            'p': precision_score,
            'r': recall_score,
            'acc': accuracy_score,
        }
        metric_func = metrics_map.get(score_type, f1_score)
        vec_x = torch.tensor([field_x.stoi[token] for token in x])
        len_vec_x = torch.tensor([len(vec_x)]).to(DEVICE)
        vec_pos = torch.tensor([field_pos.stoi[tag] for tag in pos])
        vec_rel = torch.tensor([int(flag) for flag in rel])
        # NOTE(review): the end of this call was truncated in the source.
        # Passing the otherwise unused ``len_vec_x`` and taking element 0 of
        # the result (the single sequence in the batch) is presumed -- confirm.
        predict_y = model(vec_x.view(-1, 1).to(DEVICE), vec_pos.view(-1, 1).to(DEVICE),
                          vec_rel.view(-1, 1).to(DEVICE), len_vec_x)[0]
        true_y = [field_y.stoi[tag] for tag in y]
        assert len(true_y) == len(predict_y)
        if metric_func is accuracy_score:
            # accuracy_score() does not accept an ``average`` keyword.
            return metric_func(true_y, predict_y)
        # Arguments follow the (y_true, y_pred) convention; for 1-D
        # single-label input the micro average does not depend on the order.
        return metric_func(true_y, predict_y, average='micro')
def get_score(self, model, x, y, score_type='f1'):
        """Score the actions ``model`` takes on ``x`` against the gold ``y``.

        :param model: callable returning ``(outputs, dep_graph,
            actions_done)``; only ``actions_done`` is scored
        :param x: model input
        :param y: gold action tensor, flattened to a list before scoring
        :param score_type: one of 'f1', 'p', 'r', 'acc'; anything else falls
            back to 'f1'
        :return: the micro-averaged score (plain accuracy for 'acc')
        """
        metrics_map = {
            'f1': f1_score,
            'p': precision_score,
            'r': recall_score,
            'acc': accuracy_score,
        }
        metric_func = metrics_map.get(score_type, f1_score)
        _, _, actions_done = model(x)
        assert len(actions_done) == len(y)
        predict_y = actions_done
        true_y = y.cpu().view(-1).tolist()
        if metric_func is accuracy_score:
            # accuracy_score() does not accept an ``average`` keyword.
            return metric_func(true_y, predict_y)
        # Arguments follow the (y_true, y_pred) convention; for 1-D
        # single-label input the micro average does not depend on the order.
        return metric_func(true_y, predict_y, average='micro')
def get_score(self, model, texts, labels, score_type='f1'):
        """Score ``model`` on one batch of texts.

        :param model: callable invoked as ``model(texts)``; its output is
            soft-maxed over dim 1 and the arg-max is the predicted class
        :param texts: input tensor whose dim 0 matches ``len(labels)``
        :param labels: gold label tensor
        :param score_type: one of 'f1', 'p', 'r', 'acc'; anything else falls
            back to 'f1'
        :return: the micro-averaged score (plain accuracy for 'acc')
        """
        metrics_map = {
            'f1': f1_score,
            'p': precision_score,
            'r': recall_score,
            'acc': accuracy_score,
        }
        metric_func = metrics_map.get(score_type, f1_score)
        assert texts.size(0) == len(labels)
        vec_predict = model(texts)
        soft_predict = torch.softmax(vec_predict, dim=1)
        _, predict_index = torch.max(soft_predict.cpu().data, dim=1)
        labels = labels.view(-1).cpu().data.numpy()
        if metric_func is accuracy_score:
            # accuracy_score() does not accept an ``average`` keyword.
            return metric_func(labels, predict_index)
        # Arguments follow the (y_true, y_pred) convention; for 1-D
        # single-label input the micro average does not depend on the order.
        return metric_func(labels, predict_index, average='micro')
def get_score(self, model, texts, labels, score_type='f1'):
        """Score ``model`` on one batch of texts.

        :param model: callable invoked as ``model(texts)``; its output is
            soft-maxed over dim 1 and the arg-max is the predicted class
        :param texts: model input with the same length as ``labels``
        :param labels: gold label tensor
        :param score_type: one of 'f1', 'p', 'r', 'acc'; anything else falls
            back to 'f1'
        :return: the micro-averaged score (plain accuracy for 'acc')
        """
        metrics_map = {
            'f1': f1_score,
            'p': precision_score,
            'r': recall_score,
            'acc': accuracy_score,
        }
        metric_func = metrics_map.get(score_type, f1_score)
        assert len(texts) == len(labels)
        vec_predict = model(texts)
        soft_predict = torch.softmax(vec_predict, dim=1)
        _, predict_index = torch.max(soft_predict.cpu().data, dim=1)
        labels = labels.view(-1).cpu().data.numpy()
        if metric_func is accuracy_score:
            # accuracy_score() does not accept an ``average`` keyword.
            return metric_func(labels, predict_index)
        # Arguments follow the (y_true, y_pred) convention; for 1-D
        # single-label input the micro average does not depend on the order.
        return metric_func(labels, predict_index, average='micro')
def get_score(self, model, src, src_lens, trg, score_type='f1'):
        """Score a sequence model position by position.

        The first position along dim 0 is dropped from both the model output
        and the transposed target (presumably the start-of-sequence step --
        confirm), then both are flattened and compared.

        :param model: callable invoked as ``model(src, src_lens, trg)``
        :param src: source batch
        :param src_lens: source lengths
        :param trg: target tensor; transposed (1, 0) before flattening
        :param score_type: one of 'f1', 'p', 'r', 'acc'; anything else falls
            back to 'f1'
        :return: the micro-averaged score (plain accuracy for 'acc')
        """
        metrics_map = {
            'f1': f1_score,
            'p': precision_score,
            'r': recall_score,
            'acc': accuracy_score,
        }
        metric_func = metrics_map.get(score_type, f1_score)
        output = model(src, src_lens, trg)
        output = output[1:].contiguous()
        output = output.view(-1, output.shape[-1])
        trg = trg.transpose(1, 0)
        trg = trg[1:].contiguous()
        trg = trg.view(-1)
        soft_predict = torch.softmax(output, dim=1)
        _, predict_index = torch.max(soft_predict.cpu().data, dim=1)
        labels = trg.cpu().data.numpy()
        if metric_func is accuracy_score:
            # accuracy_score() does not accept an ``average`` keyword.
            return metric_func(labels, predict_index)
        # Arguments follow the (y_true, y_pred) convention; for 1-D
        # single-label input the micro average does not depend on the order.
        return metric_func(labels, predict_index, average='micro')
def get_score(self, model, src, src_lens, trg, score_type='f1'):
        """Score a sequence model position by position.

        The first position along dim 0 is dropped from both the model output
        and the transposed target (presumably the start-of-sequence step --
        confirm), then both are flattened and compared.

        :param model: callable invoked as ``model(src, src_lens, trg)``
        :param src: source batch
        :param src_lens: source lengths
        :param trg: target tensor; transposed (1, 0) before flattening
        :param score_type: one of 'f1', 'p', 'r', 'acc'; anything else falls
            back to 'f1'
        :return: the micro-averaged score (plain accuracy for 'acc')
        """
        metrics_map = {
            'f1': f1_score,
            'p': precision_score,
            'r': recall_score,
            'acc': accuracy_score,
        }
        metric_func = metrics_map.get(score_type, f1_score)
        output = model(src, src_lens, trg)
        output = output[1:].contiguous()
        output = output.view(-1, output.shape[-1])
        trg = trg.transpose(1, 0)
        trg = trg[1:].contiguous()
        trg = trg.view(-1)
        soft_predict = torch.softmax(output, dim=1)
        _, predict_index = torch.max(soft_predict.cpu().data, dim=1)
        labels = trg.cpu().data.numpy()
        if metric_func is accuracy_score:
            # accuracy_score() does not accept an ``average`` keyword.
            return metric_func(labels, predict_index)
        # Arguments follow the (y_true, y_pred) convention; for 1-D
        # single-label input the micro average does not depend on the order.
        return metric_func(labels, predict_index, average='micro')
def add_run(self, y_true, y_pred, domains_test):
        """
        Add a completed run

        The label pair is stored, the pooled ``self.y_true`` / ``self.y_pred``
        arrays are extended, the run's confusion matrix is appended to
        ``self.cms`` and every wrongly predicted sample is recorded in
        ``self.missclassified`` as ``(domain, true label)``.

        :param domains_test: items aligned with ``y_true``; indexed by position
        :param y_true: true labels
        :param y_pred:  predicted labels
        """
        log.verbose('Adding run.\ny_true: {!s}\ny_pred: {!s}'.format(y_true, y_pred))
        self.ys.append((y_true, y_pred))
        self.y_true = numpy.concatenate((self.y_true, y_true))
        self.y_pred = numpy.concatenate((self.y_pred, y_pred))
        self.cms.append(confusion_matrix(y_true, y_pred))
        # NOTE(review): this replaces ``self.scores`` with the scores of the
        # latest run instead of accumulating across runs -- confirm intent.
        self.scores = {'accuracy': [accuracy_score(y_true, y_pred)],
                       'precision': [precision_score(y_true, y_pred)],
                       'recall': [recall_score(y_true, y_pred)],
                       'roc': [roc_auc_score(y_true, y_pred)],
                       'f1': [f1_score(y_true, y_pred)]}
        for i in range(len(y_true)):
            if y_true[i] != y_pred[i]:
                self.missclassified.append((domains_test[i], y_true[i]))
def test_recall_op(generator_fn, y_true_all, y_pred_all, pos_indices,
                   average):
    # NOTE(review): the end of the signature, the dataset constructor, the
    # session loop body and the final assertion were truncated in the source.
    # The missing pieces below are a reconstruction -- verify against the
    # upstream test before relying on it.

    # Recall on the whole dataset (reference value from scikit-learn)
    pr_sk = recall_score(
        y_true_all, y_pred_all, labels=pos_indices, average=average)

    # Create Tensorflow graph
    ds = tf.data.Dataset.from_generator(
        generator_fn, (tf.int32, tf.int32), ([None], [None]))
    y_true, y_pred = ds.make_one_shot_iterator().get_next()
    pr_tf = tf_metrics.recall(y_true, y_pred, 4, pos_indices, average=average)

    with tf.Session() as sess:
        # Initialize and run the update op on each batch
        sess.run(tf.local_variables_initializer())
        while True:
            try:
                sess.run(pr_tf[1])
            except OutOfRangeError:
                # The one-shot iterator is exhausted: every batch was seen.
                break

        # Check final value
        assert np.allclose(sess.run(pr_tf[0]), pr_sk)
def calc_metrics(y_true, y_hat, max_steps=1000):
    """Compute log-loss, AUC and the best-F1 operating point of binary scores.

    Thresholds ``i / max_steps`` for ``i`` in ``1 .. max_steps - 1`` are
    tried; ``y_hat > threshold`` is scored at each one and the threshold with
    the highest F1 (first one on ties) is reported.

    :return: dict with keys 'Logloss', 'AUC', 'F1', 'Precision', 'Recall'
        and 'Threshold'
    """
    truth = np.array(y_true)
    scores = np.array(y_hat)
    summary = {
        'Logloss': float(log_loss(truth, scores)),
        'AUC': roc_auc_score(truth, scores),
    }

    f1_curve, precision_curve, recall_curve = [], [], []
    for step in range(1, max_steps):
        hard_labels = scores > float(step) / max_steps
        f1_curve.append(f1_score(truth, hard_labels))
        precision_curve.append(precision_score(truth, hard_labels))
        recall_curve.append(recall_score(truth, hard_labels))

    best = np.argmax(f1_curve)
    summary['F1'] = f1_curve[best]
    summary['Precision'] = precision_curve[best]
    summary['Recall'] = recall_curve[best]
    # Steps start at 1, so list index ``best`` corresponds to step best + 1.
    summary['Threshold'] = float(best + 1) / max_steps
    return summary
def evaluate_precision_recall(y, target, labels):
  """Compute one-vs-rest precision/recall/F1 per class plus overall accuracy.

  ``target`` is cut to ``len(y)``; classes are taken to be
  ``0 .. max(target)``.  For each class both sequences are reduced with
  ``_extract_single_class`` before scoring, and the resulting record is
  printed.

  :param y: predicted class ids
  :param target: true class ids
  :param labels: sequence indexed by class id, stored under 'label'
  :return: ``(accuracy, results)`` with one dict per class in ``results``
  """
  import sklearn.metrics as metrics
  target = target[:len(y)]
  num_classes = max(target) + 1
  results = []
  for i in range(num_classes):
    class_target = _extract_single_class(i, target)
    class_y = _extract_single_class(i, y)

    # NOTE(review): the ``results.append({`` wrapper was lost in the source;
    # it is implied by the ``results[-1]`` print below.
    results.append({
      'precision': metrics.precision_score(class_target, class_y),
      'recall': metrics.recall_score(class_target, class_y),
      'f1': metrics.f1_score(class_target, class_y),
      # float() keeps this a true ratio under Python 2 integer division.
      'fraction': sum(class_target) / float(len(target)),
      '#of_class': int(sum(class_target)),
      'label': labels[i],
      'label_id': i,
    })
    print('%d/%d' % (i, num_classes), results[-1])
  accuracy = metrics.accuracy_score(target, y)
  return accuracy, results
def report_evaluation_metrics(y_true, y_pred):
    """Print average precision, precision, recall and F1 for binary labels.

    Precision, recall and F1 are computed with ``labels=[0, 1]`` and class 1
    as the positive label.  Nothing is returned.
    """
    binary_kwargs = dict(labels=[0, 1], pos_label=1)
    # All scores are computed first, then reported in the same order.
    report = [
        ('Average precision-recall score: {0:0.2f}',
         average_precision_score(y_true, y_pred)),
        ('Precision: {0:0.2f}',
         precision_score(y_true, y_pred, **binary_kwargs)),
        ('Recall: {0:0.2f}',
         recall_score(y_true, y_pred, **binary_kwargs)),
        ('F1: {0:0.2f}',
         f1_score(y_true, y_pred, **binary_kwargs)),
    ]
    for template, value in report:
        print(template.format(value))
def report_metrics(self, threshold):
        """Print micro- and macro-averaged precision, recall and F1.

        Scores compare ``self.y_true`` with ``self.y_pred``; ``threshold`` is
        only echoed in the heading line.
        """
        scorers = (
            ("precision:\t{}", precision_score),
            ("recall:\t{}", recall_score),
            ("f1:\t{}", f1_score),
        )
        for average_strategy in ("micro", "macro"):
            print("{} average strategy, threshold {}".format(average_strategy, threshold))
            for template, scorer in scorers:
                value = scorer(self.y_true, self.y_pred, average=average_strategy)
                print(template.format(value))
def precision_and_recall(label_gt, label_pred, n_class):
    """Return per-class precision and recall as float32 arrays.

    Both label inputs are converted to float32 and flattened; scores are
    computed for the classes ``0 .. n_class - 1`` without averaging.

    :return: ``(precision, recall)``, each of length ``n_class``
    """
    from sklearn.metrics import precision_score, recall_score
    assert len(label_gt) == len(label_pred)
    flat_gt = np.array(label_gt, dtype=np.float32).flatten()
    flat_pred = np.array(label_pred, dtype=np.float32).flatten()

    per_class_precision = precision_score(
        flat_gt, flat_pred, average=None, labels=range(n_class))
    per_class_recall = recall_score(
        flat_gt, flat_pred, average=None, labels=range(n_class))

    precision = np.asarray(per_class_precision, dtype=np.float32)
    recall = np.asarray(per_class_recall, dtype=np.float32)
    return precision, recall
def recall_at_threshold(self, y_true, y_scores, threshold):
        """Dynamic threshold recall.

        Scores greater than or equal to ``threshold`` are predicted as 1,
        everything else as 0, and recall is computed against ``y_true``.
        """
        y_pred = np.asarray([1 if score >= threshold else 0 for score in y_scores])
        return metrics.recall_score(y_true, y_pred)
def recall_at_k(self, y_true, y_scores, k):
        """Dynamic k recall, where 0<k<1.

        Predictions are obtained from ``self.k_predictions(y_scores, k)`` and
        recall is computed against ``y_true``.
        """
        y_pred = self.k_predictions(y_scores, k)
        return metrics.recall_score(y_true, y_pred)