Skip to content

Commit 5ba27b7

Browse files
Merge branch 'rz_rnaseqmetrics_parameterize_endbias_length' of https://github.com/watchmaker-genomics/picard into rz_rnaseqmetrics_parameterize_endbias_length
2 parents f8fa136 + ce2abb5 commit 5ba27b7

File tree

8 files changed

+272
-2
lines changed

8 files changed

+272
-2
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM openjdk:8
1+
FROM adoptopenjdk:8
22
MAINTAINER Broad Institute DSDE <[email protected]>
33

44
ARG build_command=shadowJar
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package picard.fingerprint;
2+
3+
import htsjdk.samtools.util.IOUtil;
4+
import org.broadinstitute.barclay.argparser.Argument;
5+
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
6+
import picard.PicardException;
7+
import picard.cmdline.CommandLineProgram;
8+
import picard.cmdline.StandardOptionDefinitions;
9+
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;
10+
11+
import java.io.File;
12+
import java.io.FileNotFoundException;
13+
14+
@CommandLineProgramProperties(
15+
summary = "Convert Haplotype database file to vcf" +
16+
"<h3>Examples</h3>" +
17+
"<pre>" +
18+
"java -jar picard.jar ConvertHaplotypeDatabaseToVcf \\\n" +
19+
"-I haplotype_database.txt \\\n" +
20+
"-O haplotype_database.vcf.gz \\\n" +
21+
"-R reference.fasta" +
22+
"</pre>",
23+
oneLineSummary = "Convert Haplotype database file to vcf",
24+
programGroup = DiagnosticsAndQCProgramGroup.class
25+
)
26+
public class ConvertHaplotypeDatabaseToVcf extends CommandLineProgram {
27+
28+
@Argument(doc = "Haplotype database to be converted to VCF.", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME)
29+
public File INPUT;
30+
31+
@Argument(doc = "Where to write converted haplotype database VCF.", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME)
32+
public File OUTPUT;
33+
34+
@Override
35+
protected boolean requiresReference() {
36+
return true;
37+
}
38+
39+
@Override
40+
protected int doWork() {
41+
IOUtil.assertFileIsReadable(INPUT);
42+
IOUtil.assertFileIsWritable(OUTPUT);
43+
44+
final HaplotypeMap haplotypeMap = new HaplotypeMap(INPUT);
45+
46+
try {
47+
haplotypeMap.writeAsVcf(OUTPUT, REFERENCE_SEQUENCE);
48+
} catch (final FileNotFoundException ex) {
49+
throw new PicardException("Problem writing haplotype map to " + OUTPUT, ex);
50+
}
51+
52+
return 0;
53+
}
54+
}

src/main/java/picard/fingerprint/CrosscheckFingerprints.java

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,16 @@ public class CrosscheckFingerprints extends CommandLineProgram {
289289
"Should only be used with SECOND_INPUT. ", optional = true)
290290
public File SECOND_INPUT_SAMPLE_MAP;
291291

292+
@Argument(doc = "A tsv with two columns representing the individual with which each sample is associated. The first column is the sample id, and the second " +
293+
"column is the associated individual id. Values in the first column must be unique. If INPUT_SAMPLE_MAP or SECOND_INPUT_SAMPLE_MAP is also specified, " +
294+
"then the values in the first column of this file should be the sample aliases specified in the second columns of INPUT_SAMPLE_MAP and SECOND_INPUT_SAMPLE_MAP, " +
295+
"respectively. When this input is specified, expectations for matches will be based on the equality or inequality of the individual ids associated with two " +
296+
"samples, as opposed to the sample ids themselves. Samples which are not listed in this file will have their sample id used as their individual id, for the " +
297+
"purposes of match expectations. This means that one sample id could be used as the individual id for another sample, but not included in the map itself, and " +
298+
"these two samples would be considered to have come from the same individual. Note that use of this parameter only affects labelling of matches and mismatches as " +
299+
"EXPECTED or UNEXPECTED. It has no affect on how data is grouped for crosschecking.", optional = true)
300+
public File SAMPLE_INDIVIDUAL_MAP;
301+
292302
@Argument(doc = "An argument that controls how crosschecking with both INPUT and SECOND_INPUT should occur. ")
293303
public Fingerprint.CrosscheckMode CROSSCHECK_MODE = CHECK_SAME_SAMPLE;
294304

@@ -368,6 +378,7 @@ public class CrosscheckFingerprints extends CommandLineProgram {
368378
private double[][] crosscheckMatrix = null;
369379
private final List<String> lhsMatrixKeys = new ArrayList<>();
370380
private final List<String> rhsMatrixKeys = new ArrayList<>();
381+
private Map<String, String> sampleIndividualMap;
371382

372383
@Override
373384
protected String[] customCommandLineValidation() {
@@ -432,6 +443,9 @@ protected int doWork() {
432443
if (SECOND_INPUT_SAMPLE_MAP != null) {
433444
IOUtil.assertFileIsReadable(SECOND_INPUT_SAMPLE_MAP);
434445
}
446+
if (SAMPLE_INDIVIDUAL_MAP != null) {
447+
IOUtil.assertFileIsReadable(SAMPLE_INDIVIDUAL_MAP);
448+
}
435449

436450
final HaplotypeMap map = new HaplotypeMap(HAPLOTYPE_MAP);
437451
final FingerprintChecker checker = new FingerprintChecker(map);
@@ -782,12 +796,22 @@ private int crossCheckFingerprints(final Map<FingerprintIdDetails, Fingerprint>
782796
// use 1L to promote size() to a long and avoid possible overflow
783797
final long totalChecks = lhsFingerprintIdDetails.size() * ((long) rhsFingerprintIdDetails.size());
784798

799+
if (SAMPLE_INDIVIDUAL_MAP != null) {
800+
final List<FingerprintIdDetails> allFingerprintIdDetails = new ArrayList<>(lhsFingerprintIdDetails);
801+
allFingerprintIdDetails.addAll(rhsFingerprintIdDetails);
802+
final Set<String> inputSamples = allFingerprintIdDetails.stream().map(id -> id.sample).collect(Collectors.toSet());
803+
804+
sampleIndividualMap = buildSampleIndividualsMap(SAMPLE_INDIVIDUAL_MAP, inputSamples);
805+
}
806+
785807
for (int row = 0; row < lhsFingerprintIdDetails.size(); row++) {
786808
final FingerprintIdDetails lhsId = lhsFingerprintIdDetails.get(row);
787809

788810
for (int col = 0; col < rhsFingerprintIdDetails.size(); col++) {
789811
final FingerprintIdDetails rhsId = rhsFingerprintIdDetails.get(col);
790-
final boolean expectedToMatch = EXPECT_ALL_GROUPS_TO_MATCH || lhsId.sample.equals(rhsId.sample);
812+
final String lhsMatchId = resolveIndividualIfPossible(lhsId.sample);
813+
final String rhsMatchId = resolveIndividualIfPossible(rhsId.sample);
814+
final boolean expectedToMatch = EXPECT_ALL_GROUPS_TO_MATCH || lhsMatchId.equals(rhsMatchId);
791815

792816
final MatchResults results = FingerprintChecker.calculateMatchResults(lhsFingerprints.get(lhsId), rhsFingerprints.get(rhsId),
793817
GENOTYPING_ERROR_RATE, LOSS_OF_HET_RATE, false, CALCULATE_TUMOR_AWARE_RESULTS);
@@ -926,4 +950,44 @@ private FingerprintResult getMatchResults(final boolean expectedToMatch, final M
926950
}
927951
}
928952
}
953+
954+
private String resolveIndividualIfPossible(final String sampleID) {
955+
if (sampleIndividualMap != null) {
956+
return sampleIndividualMap.getOrDefault(sampleID, sampleID);
957+
}
958+
959+
return sampleID;
960+
}
961+
962+
private Map<String, String> buildSampleIndividualsMap(final File sampleIndividualMapFile, final Set<String> sampleIDsInput) {
963+
if (SAMPLE_INDIVIDUAL_MAP != null) {
964+
final Map<String, String> individualMap = getStringStringMap(sampleIndividualMapFile, "SAMPLE_INDIVIDUAL_MAP");
965+
966+
//check for allowed but unusual situations that may indicate a mistake
967+
//are there any sample ids used as individual ids?
968+
for (final String individualID : individualMap.values()) {
969+
if (sampleIDsInput.contains(individualID)) {
970+
log.warn("Sample " + individualID + " is used as an individual ID in " + sampleIndividualMapFile);
971+
}
972+
}
973+
974+
//are there any sample ids in input not found in the map?
975+
for (final String sampleId : sampleIDsInput) {
976+
if (!individualMap.keySet().contains(sampleId)) {
977+
log.warn("Sample " + sampleId + " in input data not found in " + sampleIndividualMapFile);
978+
}
979+
}
980+
981+
//are there any sample ids listed in the map that are not found in input data?
982+
for (final String sampleId : individualMap.keySet()) {
983+
if (!sampleIDsInput.contains(sampleId)) {
984+
log.warn("Sample " + sampleId + " in " + sampleIndividualMapFile + " not found in input data");
985+
}
986+
}
987+
988+
return individualMap;
989+
}
990+
991+
return null;
992+
}
929993
}
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
package picard.fingerprint;
2+
3+
import org.testng.Assert;
4+
import org.testng.annotations.Test;
5+
import picard.cmdline.CommandLineProgramTest;
6+
7+
import java.io.File;
8+
import java.io.IOException;
9+
import java.util.ArrayList;
10+
import java.util.Collections;
11+
import java.util.List;
12+
13+
public class ConvertHaplotypeDatabaseToVcfTest extends CommandLineProgramTest {
14+
private final File TEST_DATA_DIR = new File("testdata/picard/fingerprint/");
15+
private final File TEST_MAP =
16+
new File(TEST_DATA_DIR, "haplotypeMap_small.txt");
17+
18+
private final File TEST_FASTA =
19+
new File(TEST_DATA_DIR, "reference.fasta");
20+
21+
@Override
22+
public String getCommandLineProgramName() {
23+
return ConvertHaplotypeDatabaseToVcf.class.getSimpleName();
24+
}
25+
26+
@Test
27+
public void testConvertHaplotypeDatabaseToVcf() throws IOException {
28+
final File outfile = getTempOutputFile("testConvertHaplotypeDatabase", ".vcf");
29+
final String[] args = new String[]{
30+
"I=" + TEST_MAP.getAbsolutePath(),
31+
"R=" + TEST_FASTA.getAbsolutePath(),
32+
"O=" + outfile.getAbsolutePath()
33+
};
34+
35+
runPicardCommandLine(args);
36+
37+
final HaplotypeMap hapDBFromTxt = new HaplotypeMap(TEST_MAP);
38+
final HaplotypeMap hapDBFromVcf = new HaplotypeMap(outfile);
39+
40+
assertHaplotypeMapsAreEquivalent(hapDBFromTxt, hapDBFromVcf);
41+
}
42+
43+
private void assertHaplotypeMapsAreEquivalent(final HaplotypeMap hapMap1, final HaplotypeMap hapMap2) {
44+
final List<HaplotypeBlock> hapBlocks1 = new ArrayList<>(hapMap1.getHaplotypes());
45+
final List<HaplotypeBlock> hapBlocks2 = new ArrayList<>(hapMap2.getHaplotypes());
46+
47+
Assert.assertEquals(hapBlocks1.size(), hapBlocks2.size());
48+
49+
Collections.sort(hapBlocks1);
50+
Collections.sort(hapBlocks2);
51+
52+
for (int i = 0; i < hapBlocks1.size(); i++) {
53+
assertHaplotypeBlocksAreEquivalent(hapBlocks1.get(i), hapBlocks2.get(i));
54+
}
55+
}
56+
57+
private void assertHaplotypeBlocksAreEquivalent(final HaplotypeBlock hapBlock1, final HaplotypeBlock hapBlock2) {
58+
//equals method only checks for equality of domain
59+
Assert.assertEquals(hapBlock1, hapBlock2);
60+
61+
final List<Snp> snps1 = new ArrayList<>(hapBlock1.getSnps());
62+
final List<Snp> snps2 = new ArrayList<>(hapBlock2.getSnps());
63+
64+
Assert.assertEquals(snps1.size(), snps2.size());
65+
66+
Collections.sort(snps1);
67+
Collections.sort(snps2);
68+
69+
for (int i = 0; i < snps1.size(); i++) {
70+
assertSnpsAreEquivalent(snps1.get(i), snps2.get(i));
71+
}
72+
}
73+
74+
private void assertSnpsAreEquivalent(final Snp snp1, final Snp snp2) {
75+
//equals method only checks for equality of domain
76+
Assert.assertEquals(snp1, snp2);
77+
78+
Assert.assertEquals(snp1.getName(), snp2.getName());
79+
80+
//allele order may have been swapped
81+
if (snp1.getAllele1() == snp2.getAllele1() && snp1.getAllele2() == snp2.getAllele2()) {
82+
Assert.assertEquals(snp1.getMaf(), snp2.getMaf(), 0.0005); //equal rounded to third decimal
83+
} else if (snp1.getAllele1() == snp2.getAllele2() && snp1.getAllele2() == snp2.getAllele1()) {
84+
Assert.assertEquals(snp1.getMaf(), 1 - snp2.getMaf(), 0.0005); //equal rounded to third decimal
85+
} else {
86+
throw new AssertionError("Snps " + snp1.getName() + " have different alleles");
87+
}
88+
}
89+
}

src/test/java/picard/fingerprint/CrosscheckFingerprintsTest.java

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ public class CrosscheckFingerprintsTest extends CommandLineProgramTest {
6161
private File NA12891_r1_shifted_bam, NA12891_r2_shifted_bam, NA12892_r1_shifted_bam, NA12892_r2_shifted_bam;
6262
private final File referenceForCrams = new File(TEST_DATA_DIR, "reference.shifted.for.crams.fasta");
6363

64+
private final File NA12891_NA12892_diff_individuals = new File(TEST_DATA_DIR, "NA12891_NA12892_different_individuals.txt");
65+
private final File NA12891_NA12892_same_individual = new File(TEST_DATA_DIR, "NA12891_NA12892_same_individual.txt");
66+
6467
private final int NA12891_r1_RGs = 27;
6568
private final int NA12891_r2_RGs = 26;
6669
private final int NA12892_r1_RGs = 25;
@@ -432,6 +435,40 @@ public void testCrossCheckSMs(final File file1, final File file2, final int expe
432435
Assert.assertEquals(matrixParser.columnLabelsList().size(), numberOfSamples + 1 );
433436
}
434437

438+
@DataProvider(name = "bamFilesIndividuals")
439+
public Object[][] bamFilesIndividuals() {
440+
return new Object[][] {
441+
{NA12891_r1, NA12892_r1, NA12891_NA12892_diff_individuals, 0, 2},
442+
{NA12891_r1, NA12892_r1, NA12891_NA12892_same_individual, 1, 2}, // same individual, so expects match
443+
{NA12891_r2, NA12891_named_NA12892_r1, NA12891_NA12892_diff_individuals, 1, 2}, // diff individuals, so do not expect match
444+
{NA12891_r2, NA12891_named_NA12892_r1, NA12891_NA12892_same_individual, 0, 2} //same individual, so expect match
445+
};
446+
}
447+
448+
@Test(dataProvider = "bamFilesIndividuals")
449+
public void testCrossCheckIndividuals(final File file1, final File file2, final File individualsMap, final int expectedRetVal, final int numberOfSamples) throws IOException {
450+
File metrics = File.createTempFile("Fingerprinting", "NA1291.SM.crosscheck_metrics");
451+
metrics.deleteOnExit();
452+
453+
File matrix = File.createTempFile("Fingerprinting", "NA1291.SM.matrix");
454+
matrix.deleteOnExit();
455+
456+
final String[] args = new String[]{
457+
"INPUT=" + file1.getAbsolutePath(),
458+
"INPUT=" + file2.getAbsolutePath(),
459+
"OUTPUT=" + metrics.getAbsolutePath(),
460+
"MATRIX_OUTPUT=" + matrix.getAbsolutePath(),
461+
"HAPLOTYPE_MAP=" + HAPLOTYPE_MAP,
462+
"LOD_THRESHOLD=" + -1.0,
463+
"CROSSCHECK_BY=SAMPLE",
464+
"SAMPLE_INDIVIDUAL_MAP=" + individualsMap.getAbsolutePath()
465+
};
466+
doTest(args, metrics, expectedRetVal, numberOfSamples * numberOfSamples , CrosscheckMetric.DataType.SAMPLE);
467+
468+
TabbedTextFileWithHeaderParser matrixParser = new TabbedTextFileWithHeaderParser(matrix);
469+
Assert.assertEquals(matrixParser.columnLabelsList().size(), numberOfSamples + 1 );
470+
}
471+
435472
@DataProvider(name = "checkSamplesData")
436473
public Iterator<Object[]> checkSamplesData() {
437474
List<Object[]> tests = new ArrayList<>();
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
NA12891 Individual1
2+
NA12892 Individual2
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
NA12891 Individual1
2+
NA12892 Individual1
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
@HD VN:1.6 SO:unsorted
2+
@SQ SN:chr1 LN:101 M5:bd01f7e11515bb6beda8f7257902aa67
3+
@SQ SN:chr2 LN:101 M5:31c33e2155b3de5e2554b693c475b310
4+
@SQ SN:chr3 LN:101 M5:631593c6dd2048ae88dcce2bd505d295
5+
@SQ SN:chr4 LN:101 M5:c60cb92f1ee5b78053c92bdbfa19abf1
6+
@SQ SN:chr5 LN:101 M5:07ebc213c7611db0eacbb1590c3e9bda
7+
@SQ SN:chr6 LN:101 M5:7be2f5e7ee39e60a6c3b5b6a41178c6d
8+
@SQ SN:chr7 LN:202 M5:93763aaf6a455871c7d7a7718bff9ccf
9+
@SQ SN:chr8 LN:202 M5:d339678efce576d5546e88b49a487b63
10+
#CHROMOSOME POSITION NAME MAJOR_ALLELE MINOR_ALLELE MAF ANCHOR_SNP PANELS
11+
chr1 75 rs1 C A 0.159601 foo
12+
chr1 100 rs2 A G 0.223794 foo
13+
chr2 75 rs3 C G 0.437188 foo
14+
chr3 25 rs9 C T 0.722731 foo
15+
chr3 50 rs7 T C 0.740000 rs6 foo
16+
chr3 75 rs6 A T 0.623026 foo
17+
chr3 100 rs8 A T 0.260000 rs6 foo
18+
chr4 75 rs10 A G 0.205686 foo
19+
chr5 75 rs11 C G 0.558528 foo
20+
chr6 75 rs15 G C 0.452579 foo
21+
chr7 75 rs16 A T 0.662917 foo
22+
chr8 75 rs17 G A 0.392887 foo

0 commit comments

Comments
 (0)