Java源码示例:weka.filters.unsupervised.attribute.ReplaceMissingValues
示例1
/**
 * Initializes all member variables to their defaults and configures the
 * internal IBk base classifier (10 neighbors, cross-validated K selection).
 */
@Override
protected void initializeMembers() {
  super.initializeMembers();

  // reset state left over from a previous run
  m_KNNdetermined = -1;
  m_NeighborsTestset = null;
  m_TrainsetNew = null;
  m_TestsetNew = null;
  m_UseNaiveSearch = false;
  m_LabeledTestset = null;

  // filter used to impute missing attribute values
  m_Missing = new ReplaceMissingValues();

  // nearest-neighbor base classifier; the effective K is cross-validated
  m_Classifier = new IBk();
  m_Classifier.setKNN(10);
  m_Classifier.setCrossValidate(true);
  m_Classifier.setWindowSize(0);
  m_Classifier.setMeanSquared(false);
  m_KNN = m_Classifier.getKNN();

  // expose the determined K as an additional measure
  m_AdditionalMeasures.add("measureDeterminedKNN");
}
示例2
/**
 * Builds the logistic model tree classifier.
 *
 * @param data the data to train with
 * @throws Exception if classifier can't be built successfully
 */
public void buildClassifier(Instances data) throws Exception {
  // make sure the classifier can deal with this kind of data
  getCapabilities().testWithFail(data);

  // work on a copy so the caller's dataset stays untouched; instances
  // without a class value are useless for training
  Instances train = new Instances(data);
  train.deleteWithMissingClass();

  // impute missing attribute values
  m_replaceMissing = new ReplaceMissingValues();
  m_replaceMissing.setInputFormat(train);
  train = Filter.useFilter(train, m_replaceMissing);

  // optionally binarize nominal attributes once, globally
  if (m_convertNominal) {
    m_nominalToBinary = new NominalToBinary();
    m_nominalToBinary.setInputFormat(train);
    train = Filter.useFilter(train, m_nominalToBinary);
  }

  // choose the split model: either splits on the residuals or on the
  // class value itself
  int minNumInstances = 2;
  ModelSelection modSelection = m_splitOnResiduals
      ? new ResidualModelSelection(minNumInstances)
      : new C45ModelSelection(minNumInstances, train, true);

  // grow the tree from the root
  m_tree = new LMTNode(modSelection, m_numBoostingIterations, m_fastRegression,
      m_errorOnProbabilities, m_minNumInstances, m_weightTrimBeta, m_useAIC);
  m_tree.buildClassifier(train);

  // release references held by the C4.5 model selection
  if (modSelection instanceof C45ModelSelection) {
    ((C45ModelSelection) modSelection).cleanup();
  }
}
示例3
/**
 * Prepares the training data: binarizes nominal attributes, imputes
 * missing values, and finally drops instances with a missing class value.
 * The processed data and both fitted filters are kept in member fields,
 * presumably so the same transformations can be replayed at prediction
 * time — TODO confirm against the callers of m_TransformFilter/m_MissingFilter.
 *
 * @param data data to be cleaned up
 * @throws Exception if a filter cannot be applied
 */
private void cleanUpData(Instances data)throws Exception{
m_Data = data;
// convert nominal attributes into binary indicator attributes
m_TransformFilter = new NominalToBinary();
m_TransformFilter.setInputFormat(m_Data);
m_Data = Filter.useFilter(m_Data, m_TransformFilter);
// replace missing attribute values (Weka's mean/mode imputation filter)
m_MissingFilter = new ReplaceMissingValues();
m_MissingFilter.setInputFormat(m_Data);
m_Data = Filter.useFilter(m_Data, m_MissingFilter);
// instances without a class value cannot contribute to training
m_Data.deleteWithMissingClass();
}
示例4
/**
 * Default constructor: sets up the pre-processing filters used before
 * the actual PLS computation (missing-value imputation and centering).
 */
public PLSFilter() {
  super();

  // pre-processing: impute missing values, then center the data
  m_Missing = new ReplaceMissingValues();
  m_Filter = new Center();
}
示例5
/**
 * Replaces missing attribute values in the given dataset using Weka's
 * {@code ReplaceMissingValues} filter (mean/mode imputation), returning
 * the filtered copy.
 *
 * <p>Bug fix: the original implementation called {@code System.exit(0)}
 * inside the catch block, terminating the whole JVM — with a success
 * exit code — on any filtering error, and making its own
 * {@code nd = data} fallback unreachable. This version logs the error
 * and returns the unfiltered data instead. Dead commented-out code was
 * also removed.</p>
 *
 * @param data the dataset possibly containing missing values
 * @return a new dataset with missing values imputed, or {@code data}
 *         itself if the filter could not be applied
 */
public static Instances estimateMissing(Instances data) {
ReplaceMissingValues nb = new ReplaceMissingValues();
try {
nb.setInputFormat(data);
// push every instance through the filter, then close the batch
int n = data.numInstances();
for (int i = 0; i < n; i++) {
nb.input(data.instance(i));
}
nb.batchFinished();
// collect the filtered instances into the output format
Instances nd = nb.getOutputFormat();
for (int i = 0; i < n; i++) {
nd.add(nb.output());
}
return nd;
} catch (Exception e) {
System.out.println("Error in estimateMissing = " + e.toString());
// best effort: hand back the original, unfiltered data
return data;
}
}
示例6
/**
 * Initializes the members: default filter and collective classifier,
 * with the processed datasets left unset until they are built.
 */
@Override
protected void initializeMembers() {
  super.initializeMembers();

  // defaults: missing-value filter and YATSI collective classifier
  m_Filter = new ReplaceMissingValues();
  m_Classifier = new YATSI();

  // created lazily later on
  m_TrainsetNew = null;
  m_TestsetNew = null;
}
示例7
/**
 * Builds a clusterer for a set of instances. Delegates the actual
 * clustering to the wrapped clusterer, then fits per-cluster
 * per-attribute distribution models: a DiscreteEstimator for each
 * nominal attribute and a [mean, stdDev] pair for each numeric one,
 * plus Laplace-corrected cluster priors.
 *
 * @param data the instances to train the clusterer with
 * @throws Exception if the clusterer hasn't been set or something goes wrong
 */
public void buildClusterer(Instances data) throws Exception {
// can clusterer handle the data?
getCapabilities().testWithFail(data);
// impute missing values first so every value can be used below
m_replaceMissing = new ReplaceMissingValues();
m_replaceMissing.setInputFormat(data);
data = weka.filters.Filter.useFilter(data, m_replaceMissing);
// keep an empty copy of the header for later use
m_theInstances = new Instances(data, 0);
if (m_wrappedClusterer == null) {
throw new Exception("No clusterer has been set");
}
// run the wrapped clusterer on the filtered data
m_wrappedClusterer.buildClusterer(data);
// m_model[cluster][attr]: discrete estimator for nominal attributes;
// m_modelNormal[cluster][attr]: [0]=mean, [1]=std dev for numeric ones
m_model =
new DiscreteEstimator[m_wrappedClusterer.numberOfClusters()][data.numAttributes()];
m_modelNormal =
new double[m_wrappedClusterer.numberOfClusters()][data.numAttributes()][2];
// weights[cluster][attr]: total instance weight seen per numeric attribute
double[][] weights = new double[m_wrappedClusterer.numberOfClusters()][data.numAttributes()];
m_priors = new double[m_wrappedClusterer.numberOfClusters()];
for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
m_priors[i] = 1.0; // laplace correction
for (int j = 0; j < data.numAttributes(); j++) {
if (data.attribute(j).isNominal()) {
m_model[i][j] = new DiscreteEstimator(data.attribute(j).numValues(),
true);
}
}
}
Instance inst = null;
// Compute mean, etc.
// first pass: assign each instance to a cluster, accumulate priors,
// nominal value counts and weighted sums for numeric attributes
int[] clusterIndex = new int[data.numInstances()];
for (int i = 0; i < data.numInstances(); i++) {
inst = data.instance(i);
int cluster = m_wrappedClusterer.clusterInstance(inst);
m_priors[cluster] += inst.weight();
for (int j = 0; j < data.numAttributes(); j++) {
if (!inst.isMissing(j)) {
if (data.attribute(j).isNominal()) {
m_model[cluster][j].addValue(inst.value(j),inst.weight());
} else {
m_modelNormal[cluster][j][0] += inst.weight() * inst.value(j);
weights[cluster][j] += inst.weight();
}
}
}
clusterIndex[i] = cluster;
}
// turn the weighted sums into weighted means
for (int j = 0; j < data.numAttributes(); j++) {
if (data.attribute(j).isNumeric()) {
for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
if (weights[i][j] > 0) {
m_modelNormal[i][j][0] /= weights[i][j];
}
}
}
}
// Compute standard deviations
// second pass: accumulate weighted squared deviations from the means
for (int i = 0; i < data.numInstances(); i++) {
inst = data.instance(i);
for (int j = 0; j < data.numAttributes(); j++) {
if (!inst.isMissing(j)) {
if (data.attribute(j).isNumeric()) {
double diff = m_modelNormal[clusterIndex[i]][j][0] - inst.value(j);
m_modelNormal[clusterIndex[i]][j][1] += inst.weight() * diff * diff;
}
}
}
}
// finalize std devs; clusters with no weight get Double.MAX_VALUE, and
// degenerate (too small) std devs fall back to the global attribute
// std dev, floored at m_minStdDev
for (int j = 0; j < data.numAttributes(); j++) {
if (data.attribute(j).isNumeric()) {
for (int i = 0; i < m_wrappedClusterer.numberOfClusters(); i++) {
if (weights[i][j] > 0) {
m_modelNormal[i][j][1] =
Math.sqrt(m_modelNormal[i][j][1] / weights[i][j]);
} else if (weights[i][j] <= 0) {
m_modelNormal[i][j][1] = Double.MAX_VALUE;
}
if (m_modelNormal[i][j][1] <= m_minStdDev) {
m_modelNormal[i][j][1] = data.attributeStats(j).numericStats.stdDev;
if (m_modelNormal[i][j][1] <= m_minStdDev) {
m_modelNormal[i][j][1] = m_minStdDev;
}
}
}
}
}
// normalize priors to a probability distribution
Utils.normalize(m_priors);
}
示例8
/**
 * Builds the functional tree classifier.
 *
 * <p>Fixes over the original: the three independent
 * {@code if (m_modelType == …)} tests had no default branch, so an
 * unexpected model type silently reused a stale tree from a previous
 * call (or caused an NPE); they are now an exclusive if/else-if chain
 * with an explicit error. The unused local {@code minNumInstances}
 * (the constructors actually use {@code m_minNumInstances}) was removed.</p>
 *
 * @param data the data to train with
 * @throws Exception if classifier can't be built successfully
 */
public void buildClassifier(Instances data) throws Exception {
  // can classifier handle the data?
  getCapabilities().testWithFail(data);

  // remove instances with missing class (work on a copy)
  Instances filteredData = new Instances(data);
  filteredData.deleteWithMissingClass();

  // replace missing values
  m_replaceMissing = new ReplaceMissingValues();
  m_replaceMissing.setInputFormat(filteredData);
  filteredData = Filter.useFilter(filteredData, m_replaceMissing);

  // possibly convert nominal attributes globally
  if (m_convertNominal) {
    m_nominalToBinary = new NominalToBinary();
    m_nominalToBinary.setInputFormat(filteredData);
    filteredData = Filter.useFilter(filteredData, m_nominalToBinary);
  }

  // create the tree root matching the requested model type
  if (m_modelType == 0) {
    // FT: functional inner nodes and leaves
    m_tree = new FTNode(m_errorOnProbabilities, m_numBoostingIterations,
        m_minNumInstances, m_weightTrimBeta, m_useAIC);
  } else if (m_modelType == 1) {
    // FTLeaves: functional leaves only
    m_tree = new FTLeavesNode(m_errorOnProbabilities, m_numBoostingIterations,
        m_minNumInstances, m_weightTrimBeta, m_useAIC);
  } else if (m_modelType == 2) {
    // FTInner: functional inner nodes only
    m_tree = new FTInnerNode(m_errorOnProbabilities, m_numBoostingIterations,
        m_minNumInstances, m_weightTrimBeta, m_useAIC);
  } else {
    throw new Exception("Unknown model type: " + m_modelType);
  }

  // build, prune and clean up the tree
  m_tree.buildClassifier(filteredData);
  m_tree.prune();
  m_tree.assignIDs(0);
  m_tree.cleanup();
}
示例9
/**
 * Converts models serialized with an old Weka version (&lt; 3.7.11) so
 * they work with newer Weka releases. Detection: on old models,
 * accessing the class attribute of the stored structure throws an NPE.
 * The fix rebuilds the header {@code Instances} from the model's class
 * shapes and feature description, then patches the wrapped SMO
 * classifier's private fields via reflection. Recurses into the
 * segmentation, secondary segmentation and exclusion sub-models.
 *
 * @param model the model to fix in place; {@code null} is a no-op
 */
public static void fixOldModelVersion(final OrbitModel model) {
if (model == null) return; // nothing to fix
boolean oldWekaVersion = false;
try {
// old serialized structures blow up here with an NPE
model.getStructure().classAttribute().numValues();
} catch (NullPointerException ne) {
oldWekaVersion = true;
}
// apply old model fix?
if (oldWekaVersion) {
logger.info("model from old weka version (< 3.7.11) detected, trying to apply fixes");
int numClasses = model.getClassShapes().size();
TissueFeatures tf = new TissueFeatures(model.getFeatureDescription(), null);
// +1 for the class attribute appended below
int numFeatures = tf.getFeaturesPerSample() * model.getFeatureDescription().getSampleSize() + 1;
ArrayList<Attribute> attrInfo = new ArrayList<Attribute>(numFeatures);
for (int a = 0; a < numFeatures - 1; a++) {
Attribute attr = new Attribute("a" + a);
attrInfo.add(attr);
}
// class values are the shape indices rendered as doubles
List<String> classValues = new ArrayList<String>(numClasses);
for (int i = 0; i < numClasses; i++) {
classValues.add((i + 1) + ".0"); // "1.0", "2.0", ...
}
Attribute classAttr = new Attribute("class", classValues);
attrInfo.add(classAttr);
// rebuild the dataset header and install it on the model
Instances structure = new Instances("trainSet pattern classes", attrInfo, 0);
structure.setClassIndex(numFeatures - 1);
model.setStructure(structure);
try {
if (model.getClassifier() != null && model.getClassifier().getClassifier() != null && model.getClassifier().getClassifier() instanceof SMO) {
SMO smo = ((SMO) model.getClassifier().getClassifier());
// patch SMO's private class-attribute reference
Field field = smo.getClass().getDeclaredField("m_classAttribute");
field.setAccessible(true);
field.set(smo, classAttr);
// missing values
// install a freshly initialized ReplaceMissingValues filter
ReplaceMissingValues rmv = new ReplaceMissingValues();
rmv.setInputFormat(structure);
Field missing = smo.getClass().getDeclaredField("m_Missing");
missing.setAccessible(true);
missing.set(smo, rmv);
// filter
// repair the normalization filter's locators and output format;
// the fields live two superclasses up (Filter base class —
// TODO confirm the exact class hierarchy for the Weka version in use)
Field filter = smo.getClass().getDeclaredField("m_Filter");
filter.setAccessible(true);
Filter normalize = (Filter) filter.get(smo);
RelationalLocator relLoc = new RelationalLocator(structure);
StringLocator strLoc = new StringLocator(structure);
Field outputRelAtts = normalize.getClass().getSuperclass().getSuperclass().getDeclaredField("m_OutputRelAtts");
outputRelAtts.setAccessible(true);
outputRelAtts.set(normalize, relLoc);
Field inputRelAtts = normalize.getClass().getSuperclass().getSuperclass().getDeclaredField("m_InputRelAtts");
inputRelAtts.setAccessible(true);
inputRelAtts.set(normalize, relLoc);
Field outputStrAtts = normalize.getClass().getSuperclass().getSuperclass().getDeclaredField("m_OutputStringAtts");
outputStrAtts.setAccessible(true);
outputStrAtts.set(normalize, strLoc);
Field inputStrAtts = normalize.getClass().getSuperclass().getSuperclass().getDeclaredField("m_InputStringAtts");
inputStrAtts.setAccessible(true);
inputStrAtts.set(normalize, strLoc);
Field outputFormat = normalize.getClass().getSuperclass().getSuperclass().getDeclaredField("m_OutputFormat");
outputFormat.setAccessible(true);
outputFormat.set(normalize, structure);
logger.info("fixes applied, the model should work with a weka version >= 3.7.11 now");
} // else: good luck...
} catch (Exception e) {
e.printStackTrace();
logger.error("new weka version fixes could not be applied: " + e.getMessage());
}
} // old weka version
fixOldModelVersion(model.getSegmentationModel()); // fixOldModelVersion can handle null
fixOldModelVersion(model.getSecondarySegmentationModel()); // fixOldModelVersion can handle null
fixOldModelVersion(model.getExclusionModel()); // fixOldModelVersion can handle null
}
示例10
/**
 * Pre-processes a dataset for learning: removes near-useless attributes,
 * imputes missing values, discretizes numeric attributes into bins, and
 * keeps only attributes whose information gain exceeds a threshold.
 *
 * <p>Bug fix: the original code configured the {@code Discretize} filter
 * but then re-applied {@code fixMissing} instead
 * ({@code fixMissing.setInputFormat(data); Filter.useFilter(data, fixMissing)}),
 * so discretization never actually happened. The copy-pasted
 * "Remove useless attributes" comments on each step were also corrected.</p>
 *
 * @param data the dataset to pre-process
 * @return the transformed dataset
 * @throws Exception if a filter or the attribute selection fails
 */
public static Instances preProcessData(Instances data) throws Exception {
/*
 * Remove useless attributes
 */
RemoveUseless removeUseless = new RemoveUseless();
removeUseless.setOptions(new String[] { "-M", "99" }); // threshold
removeUseless.setInputFormat(data);
data = Filter.useFilter(data, removeUseless);
/*
 * Replace missing values
 */
ReplaceMissingValues fixMissing = new ReplaceMissingValues();
fixMissing.setInputFormat(data);
data = Filter.useFilter(data, fixMissing);
/*
 * Discretize numeric attributes
 */
Discretize discretizeNumeric = new Discretize();
discretizeNumeric.setOptions(new String[] {
"-O",
"-M", "-1.0",
"-B", "4", // no of bins
"-R", "first-last"}); //range of attributes
// BUG FIX: apply the discretize filter, not fixMissing a second time
discretizeNumeric.setInputFormat(data);
data = Filter.useFilter(data, discretizeNumeric);
/*
 * Select only informative attributes
 */
InfoGainAttributeEval eval = new InfoGainAttributeEval();
Ranker search = new Ranker();
search.setOptions(new String[] { "-T", "0.001" }); // information gain threshold
AttributeSelection attSelect = new AttributeSelection();
attSelect.setEvaluator(eval);
attSelect.setSearch(search);
// apply attribute selection
attSelect.SelectAttributes(data);
// remove the attributes not selected in the last run
data = attSelect.reduceDimensionality(data);
return data;
}
示例11
/**
 * Parses a given list of options. Valid options are: <p/>
 *
 * -D <br/>
 * Turn on debugging output.<p/>
 *
 * -W classname <br/>
 * Specify the full class name of a classifier as the basis for
 * collective classifying (required).<p/>
 *
 * -folds folds <br/>
 * the number of folds for splitting the training set into train and test
 * set. the first fold is always the training set. With '-V' you can invert
 * this, i.e., instead of 20/80 for 5 folds you'll get 80/20. (default 5) <p/>
 *
 * -V <br/>
 * inverts the fold selection, i.e., instead of using the first fold for the
 * training set it is used for test set and the remaining folds for training.
 * <p/>
 *
 * -S seed <br/>
 * Random number seed for resampling (default 1). <p/>
 *
 * -verbose <br/>
 * whether to output some more information during improving the classifier.
 * <p/>
 *
 * -insight <br/>
 * whether to use the labels of the original test set to output more
 * statistics. <p/>
 *
 * -F class-spec <br/>
 * The classname and parameters for the filter
 * (default ReplaceMissingValues). <p/>
 *
 * Options after -- are passed to the designated classifier.<p/>
 *
 * @param options the list of options as an array of strings
 * @exception Exception if an option is not supported
 */
@Override
public void setOptions(String[] options) throws Exception {
  String filterSpec = Utils.getOption('F', options);
  if (filterSpec.length() == 0) {
    // no -F given: fall back to the default filter
    setFilter(new ReplaceMissingValues());
  } else {
    // first token is the filter class name, the rest are its options
    String[] filterOptions = Utils.splitOptions(filterSpec);
    String filterName = filterOptions[0];
    filterOptions[0] = "";
    setFilter((Filter) Utils.forName(Filter.class, filterName, filterOptions));
  }
  super.setOptions(options);
}
示例12
/**
 * Initializes the object: merges train and test instances into one
 * sorted, missing-value-free training set. Test instances receive a
 * weight of train/test ratio times the parent's weighting factor
 * (or 1.0 if weighting is disabled). NOTE(review): the test Instance
 * objects are mutated in place via setWeight — callers share these
 * references; verify this is intended.
 *
 * @param parent the parent algorithm
 * @param train the train instances
 * @param test the test instances
 * @param setWeights whether to set the weights for the training set
 * (the processed instances)
 * @throws Exception if something goes wrong
 */
public YATSIInstances(YATSI parent, Instances train, Instances test,
boolean setWeights)
throws Exception {
super();
m_Parent = parent;
// build sorted array (train + test)
// weight applied to every test instance
double weight;
if (getParent().getNoWeights())
weight = 1.0;
else
weight = (double) train.numInstances()
/ (double) test.numInstances()
* getParent().getWeightingFactor();
// concatenate: train instances first, then weighted test instances
m_Unprocessed = new Instance[train.numInstances() + test.numInstances()];
for (int i = 0; i < train.numInstances(); i++)
m_Unprocessed[i] = train.instance(i);
for (int i = 0; i < test.numInstances(); i++) {
m_Unprocessed[train.numInstances() + i] = test.instance(i);
m_Unprocessed[train.numInstances() + i].setWeight(weight);
}
Arrays.sort(m_Unprocessed, m_Comparator);
// weights
// remember the assigned weights; optionally reset all instances to
// weight 1 while keeping the recorded values in m_Weights
m_Weights = new double[m_Unprocessed.length];
for (int i = 0; i < m_Unprocessed.length; i++) {
m_Weights[i] = m_Unprocessed[i].weight();
if (!setWeights)
m_Unprocessed[i].setWeight(1);
}
// filter data
// copy the merged instances into a dataset with the train header
m_Trainset = new Instances(train, 0);
for (int i = 0; i < m_Unprocessed.length; i++)
m_Trainset.add(m_Unprocessed[i]);
// set up filter
// impute missing values on the combined set
m_Missing = new ReplaceMissingValues();
m_Missing.setInputFormat(m_Trainset);
m_Trainset = Filter.useFilter(m_Trainset, m_Missing);
}