Skip to content

Commit ab6c58a

Browse files
committed
Added a new fungalits_warcup gene for fungal ITS classification
Added a new biom format output
1 parent 19aab5b commit ab6c58a

File tree

10 files changed

+259
-60
lines changed

10 files changed

+259
-60
lines changed

nbproject/project.properties

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ javac.classpath=\
3939
${file.reference.commons-io-2.4.jar}:\
4040
${file.reference.jcommon.jar}:\
4141
${file.reference.jfreechart-1.0.13.jar}:\
42+
${reference.AlignmentTools.jar}:\
4243
${reference.ReadSeq.jar}:\
43-
${reference.TaxonomyTree.jar}:\
44-
${reference.AlignmentTools.jar}
44+
${reference.TaxonomyTree.jar}
4545

4646
# Space-separated list of extra javac options
4747
javac.compilerargs=

src/edu/msu/cme/rdp/classifier/cli/CmdOptions.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@ public class CmdOptions {
4545
public static final String TAXON_LONG_OPT = "taxon";
4646
public static final String BOOTSTRAP_OUTFILE_LONG_OPT = "bootstrap_outfile";
4747
public static final String SHORTSEQ_OUTFILE_LONG_OPT = "shortseq_outfile";
48-
48+
public static final String BIOMFILE_LONG_OPT = "biomFile";
49+
public static final String SAMPLE_LONG_OPT = "biomFile";
50+
public static final String METADATA_LONG_OPT = "metadata";
4951

5052
//short options
5153
public static final String QUERYFILE_SHORT_OPT = "q";
@@ -60,23 +62,29 @@ public class CmdOptions {
6062
public static final String TAXON_SHORT_OPT = "n";
6163
public static final String BOOTSTRAP_OUTFILE_SHORT_OPT = "b";
6264
public static final String SHORTSEQ_OUTFILE_SHORT_OPT = "s";
65+
public static final String BIOMFILE_SHORT_OPT = "m";
66+
public static final String METADATA_SHORT_OPT = "d";
6367

6468
// description of the options
6569
public static final String QUERYFILE_DESC = "legacy option, no longer needed ";
6670
public static final String OUTFILE_DESC = "tab-delimited text output file for classification assignment.";
6771
public static final String TRAINPROPFILE_DESC = "property file containing the mapping of the training files if not using the default."
6872
+ " Note: the training files and the property file should be in the same directory.";
69-
public static final String FORMAT_DESC = "tab-delimited output format: [allrank|fixrank|filterbyconf|db]. Default is " + DEFAULT_FORMAT + "."
73+
public static final String FORMAT_DESC = "tab-delimited output format: [allrank|fixrank|biom|filterbyconf|db]. Default is " + DEFAULT_FORMAT + "."
7074
+ "\n allrank: outputs the results for all ranks applied for each sequence: seqname, orientation, taxon name, rank, conf, ..."
7175
+ "\n fixrank: only outputs the results for fixed ranks in order: domain, phylum, class, order, family, genus"
76+
+ "\n biom: outputs rich dense biom format if OTU or metadata provided"
7277
+ "\n filterbyconf: only outputs the results for major ranks as in fixrank, results below the confidence cutoff were bin to a higher rank unclassified_node"
7378
+ "\n db: outputs the seqname, trainset_no, tax_id, conf.";
74-
public static final String GENE_DESC = ClassifierFactory.RRNA_16S_GENE + " or " + ClassifierFactory.FUNGALLSU_GENE
75-
+ ", the default gene is " + DEFAULT_GENE + ". This option will be overwritten by --train_propfile option";
79+
public static final String GENE_DESC = ClassifierFactory.RRNA_16S_GENE + ", " + ClassifierFactory.FUNGALLSU_GENE
80+
+ ", " + ClassifierFactory.FUNGALITS_warcup_GENE //+ ", " + ClassifierFactory.FUNGALITS_unite_GENE
81+
+ ". Default is " + DEFAULT_GENE + ". This option can be overwritten by -t option";
7682
public static final String MIN_WORDS_DESC = "minimum number of words for each bootstrap trial. Default(maximum) is 1/8 of the words of each sequence. Minimum is " + Classifier.MIN_BOOTSTRSP_WORDS ;
7783
public static final String HIER_OUTFILE_DESC = "tab-delimited output file containing the assignment count for each taxon in the hierarchical format. Default is null.";
7884
public static final String BOOTSTRAP_DESC = "assignment confidence cutoff used to determine the assignment count for each taxon. Range [0-1], Default is " + DEFAULT_CONF + ".";
7985

8086
public static final String BOOTSTRAP_OUTFILE_DESC = "the output file containing the number of matching assignments out of 100 bootstraps for major ranks. Default is null";
8187
public static final String SHORTSEQ_OUTFILE_DESC = "the output file containing the sequence names that are too short to be classified";
88+
public static final String BIOMFILE_DESC = "the input clluster biom file. The classification result will replace the taxonomy of the corresponding cluster id.";
89+
public static final String METADATA_DESC = "the tab delimited metadata file for the samples, with first row containing attribute name and first column containing the sample name";
8290
}

src/edu/msu/cme/rdp/classifier/io/ClassificationResultFormatter.java

Lines changed: 63 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818

1919
import edu.msu.cme.rdp.classifier.ClassificationResult;
2020
import edu.msu.cme.rdp.classifier.RankAssignment;
21+
import java.util.ArrayList;
22+
import java.util.Arrays;
2123
import java.util.HashMap;
2224
import java.util.Iterator;
2325
import java.util.List;
@@ -30,26 +32,31 @@ public class ClassificationResultFormatter {
3032
// list of major ranks
3133

3234
public static String[] RANKS = { "domain", "phylum", "class", "order", "family", "genus"};
35+
public static String[] RANKS_WITHSPECIES = { "domain", "phylum", "class", "order", "family", "genus", "species"};
36+
public static final List<ClassificationResultFormatter.FORMAT> fileFormats
37+
= new ArrayList(Arrays.asList(FORMAT.allRank,FORMAT.dbformat,FORMAT.fixRank,FORMAT.filterbyconf,FORMAT.biom));
3338

3439
public enum FORMAT {
3540

36-
allRank, fixRank, dbformat, filterbyconf;
41+
allRank, fixRank, dbformat, filterbyconf, biom;
3742
}
3843

3944
public static String getOutput(ClassificationResult result, FORMAT format){
40-
return getOutput(result, format, 0f);
45+
return getOutput(result, format, 0f, RANKS);
4146
}
4247

43-
public static String getOutput(ClassificationResult result, FORMAT format, float conf) {
48+
public static String getOutput(ClassificationResult result, FORMAT format, float conf, String[] ranks) {
4449
switch (format) {
4550
case allRank:
4651
return getAllRankOutput(result);
4752
case fixRank:
48-
return getFixRankOutput(result);
53+
return getFixRankOutput(ranks, result);
4954
case dbformat:
5055
return getDBOutput(result, conf);
5156
case filterbyconf:
52-
return getFilterByConfOutput(result, conf);
57+
return getFilterByConfOutput(ranks, result, conf);
58+
case biom:
59+
return getBiomOutput(ranks, result, conf, ';');
5360
default:
5461
getAllRankOutput(result);
5562
}
@@ -131,39 +138,72 @@ public static String getFilterByConfOutput(String[] ranks, ClassificationResult
131138
for (RankAssignment assignment : (List<RankAssignment>) result.getAssignments()) {
132139
rankMap.put(assignment.getRank().toLowerCase(), assignment);
133140
}
134-
135141
// if the score is missing for the rank, report the conf and name from the lower rank if above the conf
136142
// if the lower rank is below the conf, output unclassified node name and the conf from the one above the conf
137-
RankAssignment prevAssign = null;
138-
for (int i = ranks.length -1; i>=0; i--) {
143+
RankAssignment prevAssign = result.getAssignments().get(0);
144+
assignmentStr.append(result.getSequence().getSeqName());
145+
for (int i = 0; i <= ranks.length -1; i++) {
139146
RankAssignment assign = rankMap.get(ranks[i]);
140147
if (assign != null) {
141-
if ( assign.getConfidence() >= conf){
142-
if ( prevAssign != null && prevAssign.getConfidence() < conf){
143-
assignmentStr.insert(0, "\t" + "unclassified_" + assign.getName() +"\t" + ranks[i+1] + "\t" + assign.getConfidence());
144-
}
145-
assignmentStr.insert(0, "\t" + assign.getName() +"\t" + assign.getRank() + "\t" + assign.getConfidence());
148+
if ( assign.getConfidence() <= conf){
149+
assignmentStr.append("\t" + "unclassified_" + prevAssign.getName() );
150+
}else {
151+
assignmentStr.append("\t" + assign.getName() );
152+
prevAssign = assign;
146153
}
147-
prevAssign = assign;
154+
148155
} else {
149156
if ( prevAssign != null && prevAssign.getConfidence() >= conf){
150-
assignmentStr.insert(0, "\t" + prevAssign.getName() +"\t" + ranks[i] + "\t" + prevAssign.getConfidence());
157+
assignmentStr.append("\t" + "unclassified_" + prevAssign.getName() );
151158
}
152159
}
153160

154-
}
155-
if (result.isReverse()) {
156-
assignmentStr.insert(0,"-");
157-
} else {
158-
assignmentStr.insert(0, "");
159-
}
160-
assignmentStr.insert(0, result.getSequence().getSeqName() + "\t");
161+
}
161162
assignmentStr.append("\n");
162-
163163
return assignmentStr.toString();
164164

165165
}
166166

167+
/**
168+
* Output the classification result suitable to load into biom format.
169+
* Concatenate the rank and the taxon name, remove quotes in the taxon name
170+
*/
171+
public static String getBiomOutput(String[] ranks, ClassificationResult result, float conf, char delimiter) {
172+
StringBuilder assignmentStr = new StringBuilder();
173+
174+
HashMap<String, RankAssignment> rankMap = new HashMap<String, RankAssignment>();
175+
for (RankAssignment assignment : (List<RankAssignment>) result.getAssignments()) {
176+
rankMap.put(assignment.getRank().toLowerCase(), assignment);
177+
}
178+
// if the score is missing for the rank, report the conf and name from the lower rank if above the conf
179+
// if the lower rank is below the conf, output unclassified node name and the conf from the one above the conf
180+
// remove the quotes in the name
181+
RankAssignment prevAssign = result.getAssignments().get(0);
182+
assignmentStr.append(result.getSequence().getSeqName() + "\t");
183+
for (int i = 0; i <= ranks.length -1; i++) {
184+
RankAssignment assign = rankMap.get(ranks[i]);
185+
String rank = RANKS[i].substring(0,1).toLowerCase();
186+
if (assign != null) {
187+
if ( assign.getConfidence() <= conf){
188+
assignmentStr.append(rank + "__" + "unclassified_" + prevAssign.getName().replaceAll("\"", "") );
189+
}else {
190+
assignmentStr.append( rank + "__"+ assign.getName().replaceAll("\"", "") );
191+
prevAssign = assign;
192+
}
193+
194+
} else {
195+
if ( prevAssign != null && prevAssign.getConfidence() >= conf){
196+
assignmentStr.append( rank + "__" + "unclassified_" + prevAssign.getName().replaceAll("\"", "") );
197+
}
198+
}
199+
200+
if ( i < ranks.length -1){
201+
assignmentStr.append(delimiter);
202+
}
203+
}
204+
return assignmentStr.toString();
205+
206+
}
167207

168208
public static String getDBOutput(ClassificationResult result, float conf) {
169209
StringBuilder assignmentStr = new StringBuilder();

src/edu/msu/cme/rdp/classifier/train/validation/distance/BoxPlotUtils.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@ public static void readData(String inFile, File outdir, String xAxisLabel, Strin
7575
title = title.substring(0, index);
7676
}
7777

78-
Font lableFont = new Font("Helvetica", Font.BOLD, 30);
79-
createBoxplot(scatterDataset, new PrintStream(new File(outdir, title + "_boxchart.png")), title, xAxisLabel, yAxisLabel, lableFont);
78+
Font lableFont = new Font("Helvetica", Font.BOLD, 28);
79+
createBoxplot(scatterDataset, new PrintStream(new File(outdir, title + ".boxchart.png")), title, xAxisLabel, yAxisLabel, lableFont);
8080

8181
}
8282

src/edu/msu/cme/rdp/classifier/train/validation/distance/CompareTrainingSets.java

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,15 @@
2121
import edu.msu.cme.rdp.classifier.train.LineageSequenceParser;
2222
import edu.msu.cme.rdp.classifier.train.validation.HierarchyTree;
2323
import edu.msu.cme.rdp.classifier.train.validation.TreeFactory;
24-
import edu.msu.cme.rdp.readseq.readers.SequenceReader;
2524
import java.io.BufferedReader;
2625
import java.io.File;
26+
import java.io.FileInputStream;
2727
import java.io.FileReader;
2828
import java.io.IOException;
29+
import java.io.InputStreamReader;
2930
import java.io.PrintStream;
3031
import java.io.Reader;
32+
import java.nio.charset.Charset;
3133
import java.util.ArrayList;
3234
import java.util.Arrays;
3335
import java.util.HashMap;
@@ -78,26 +80,31 @@ public CompareTrainingSets(String rankFile, String[] files) throws IOException{
7880
}
7981

8082
private HierarchyTreeExtend parseOneTraining(String taxFile, String seqFile, int trainset_no, String version, String modification) throws IOException{
81-
Reader tax = new FileReader(taxFile);
8283
File temp = new File(taxFile);
8384
int index = temp.getName().indexOf(".");
8485
String trainsetName = temp.getName();
8586
if ( index != -1){
8687
trainsetName = trainsetName.substring(0, index);
8788
}
89+
// need to use ISO encoding for UNITE (NOTE: the FileReader below does not specify a charset, so the default charset is used)
90+
FileReader tax = new FileReader(new File(taxFile));
8891
TreeFactory factory = new TreeFactory(tax);
92+
8993
LineageSequenceParser parser = new LineageSequenceParser(new File(seqFile));
9094
LineageSequence seq;
9195
HashMap<String, String> seqMap = new HashMap<String, String>(); // seqID, desc
9296
while ( parser.hasNext()){
9397
seq = parser.next();
9498
factory.addSequence(seq, false); // donot check the kmers
9599

96-
if ( !seq.getSeqName().contains("|SH") ){
97-
seqMap.put(seq.getSeqName(), seq.getDesc());
98-
}else {// if it's seq from UNITE, we need to do something with the seqID
100+
if ( seq.getSeqName().contains("|S00") ){ // rdpID
101+
String[] values = seq.getSeqName().split("\\|");
102+
seqMap.put(values[0], seq.getDesc());
103+
}else if (seq.getSeqName().contains("|SH") ){ // if it's seq from UNITE, we need to do something with the seqID
99104
String[] values = seq.getSeqName().split("\\|");
100-
seqMap.put(values[1], seq.getDesc());
105+
seqMap.put(values[1], seq.getDesc());
106+
}else {
107+
seqMap.put(seq.getSeqName(), seq.getDesc());
101108
}
102109

103110
}

src/edu/msu/cme/rdp/classifier/train/validation/distance/TaxaSimilarityMain.java

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ public void createPlot(String plotTitle, File outdir) throws IOException{
249249
XYSeriesCollection dataset = new XYSeriesCollection();
250250
DefaultBoxAndWhiskerCategoryDataset scatterDataset = new DefaultBoxAndWhiskerCategoryDataset();
251251

252-
PrintStream boxchart_dataStream = new PrintStream(new File(outdir, plotTitle + "_boxchart.txt"));
252+
PrintStream boxchart_dataStream = new PrintStream(new File(outdir, plotTitle + ".boxchart.txt"));
253253

254254
boxchart_dataStream.println("#\tkmer" + "\trank" + "\t" + "max" + "\t" + "avg" + "\t" + "min" +
255255
"\t" + "Q1" + "\t" + "median" + "\t" + "Q3" + "\t" + "98Pct" + "\t" + "2Pct" + "\t" + "comparisons");
@@ -266,7 +266,7 @@ public void createPlot(String plotTitle, File outdir) throws IOException{
266266
int Q3 = -1;
267267
int pct_98 =-1;
268268
int pct_2 = -1;
269-
int comparisons = 0;
269+
long comparisons = 0;
270270
int minOutlier = 0; // we don't care about the outliers
271271
int maxOutlier = 0; //
272272

@@ -319,7 +319,7 @@ public void createPlot(String plotTitle, File outdir) throws IOException{
319319
}
320320
}
321321
boxchart_dataStream.close();
322-
Font lableFont = new Font("Helvetica", Font.BOLD, 30);
322+
Font lableFont = new Font("Helvetica", Font.BOLD, 28);
323323

324324
JFreeChart chart = ChartFactory.createXYLineChart(plotTitle, "Similarity%", "Percent Comparisions", dataset, PlotOrientation.VERTICAL, true, true, false );
325325
((XYPlot) chart.getPlot()).getRenderer().setStroke( new BasicStroke( 2.0f ));
@@ -332,9 +332,9 @@ public void createPlot(String plotTitle, File outdir) throws IOException{
332332
rangeAxis.setTickLabelFont(lableFont);
333333
rangeAxis.setLabelFont(lableFont);
334334
((NumberAxis)rangeAxis).setTickUnit(new NumberTickUnit(5));
335-
ChartUtilities.writeScaledChartAsPNG(new PrintStream(new File(outdir, plotTitle + "_linechart.png")), chart, 800, 1000, 3, 3);
335+
ChartUtilities.writeScaledChartAsPNG(new PrintStream(new File(outdir, plotTitle + ".linechart.png")), chart, 800, 1000, 3, 3);
336336

337-
BoxPlotUtils.createBoxplot(scatterDataset, new PrintStream(new File(outdir, plotTitle + "_boxchart.png")), plotTitle, "Rank", "Similarity%", lableFont);
337+
BoxPlotUtils.createBoxplot(scatterDataset, new PrintStream(new File(outdir, plotTitle + ".boxchart.png")), plotTitle, "Rank", "Similarity%", lableFont);
338338

339339
}
340340

@@ -378,7 +378,6 @@ public static void main(String[] args) throws IOException, OverlapCheckFailedExc
378378
}
379379
if ( args[6].equalsIgnoreCase("sab")){
380380
theObj.calSabSimilarity(args[0], args[1], args[2]);
381-
plotTitle = kmer + "mer_" + plotTitle;
382381
}else {
383382
theObj.calPairwiseSimilaritye(args[0], args[1], args[2]);
384383
}

src/edu/msu/cme/rdp/classifier/train/validation/leaveoneout/LeaveOneOutTesterMain.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,14 +73,10 @@ public class LeaveOneOutTesterMain {
7373
public LeaveOneOutTesterMain(String taxFile, String trainseqFile, String testFile, String outFile,
7474
int numGoodBases, int min_bootstrap_words, boolean hideTaxon) throws IOException {
7575
boolean useSeed = true; // use seed for random word selection
76-
System.err.println("#before TreeFactory\tfree=" + Runtime.getRuntime().freeMemory()/1000000 + "\ttotal=" + Runtime.getRuntime().totalMemory()/1000000 );
7776

7877
TreeFactory factory = new TreeFactory(new FileReader(taxFile));
7978
// create a tree
80-
System.err.println("#before craeteTree\tfree=" + Runtime.getRuntime().freeMemory()/1000000 + "\ttotal=" + Runtime.getRuntime().totalMemory()/1000000 );
81-
8279
createTree(factory, trainseqFile);
83-
System.err.println("#after craeteTree\tfree=" + Runtime.getRuntime().freeMemory()/1000000 + "\ttotal=" + Runtime.getRuntime().totalMemory()/1000000 );
8480

8581
BufferedWriter outWriter = new BufferedWriter(new FileWriter(outFile));
8682
LineageSequenceParser parser = new LineageSequenceParser(new File(testFile));

src/edu/msu/cme/rdp/classifier/utils/ClassifierFactory.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ public class ClassifierFactory {
2929

3030
public static final String RRNA_16S_GENE = "16srrna";
3131
public static final String FUNGALLSU_GENE = "fungallsu";
32+
public static final String FUNGALITS_warcup_GENE = "fungalits_warcup";
33+
public static final String FUNGALITS_unite_GENE = "fungalits_unite";
3234
private TrainingInfo trainingInfo;
3335
private static Properties urlProperties;
3436
private static String classifierVersion;

0 commit comments

Comments
 (0)