Compare commits
17 Commits
b7c86f20b3
...
756e5572b9
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
756e5572b9 | ||
|
|
c30167d5ec | ||
|
|
a19525f5bb | ||
|
|
e5803defa3 | ||
|
|
34dc2a5721 | ||
|
|
fd106a0d73 | ||
|
|
22faad3414 | ||
|
|
0b36e2b742 | ||
|
|
9dacd8cd34 | ||
|
|
89687fa849 | ||
|
|
fb443fe958 | ||
|
|
adebe1542e | ||
|
|
882fbfffc6 | ||
|
|
a88cfb8b0d | ||
|
|
deed98e79d | ||
|
|
1a35600f50 | ||
|
|
856063529b |
11
readme.md
11
readme.md
@@ -96,7 +96,7 @@ These files are often generated in sequence. When entering filenames, it is not
|
||||
(.csv or .ser). When reading or writing files, the program will automatically add the correct extension to any filename
|
||||
without one.
|
||||
|
||||
To save file I/O time, the most recent instance of each of these four
|
||||
To save file I/O time when using the interactive interface, the most recent instance of each of these four
|
||||
files either generated or read from disk can be cached in program memory. When caching is active, subsequent uses of the
|
||||
same data file won't need to be read in again until another file of that type is used or generated,
|
||||
or caching is turned off for that file type. The program checks whether it needs to update its cached data by comparing
|
||||
@@ -160,7 +160,7 @@ Options when making a Sample Plate file:
|
||||
* Number of sections on plate
|
||||
* Number of T cells per well
|
||||
* per section, if more than one section
|
||||
* Dropout rate
|
||||
* Sequence dropout rate
|
||||
|
||||
Files are in CSV format. There are no header labels. Every row represents a well.
|
||||
Every value represents an individual cell, containing four sequences, depicted as an array string:
|
||||
@@ -204,6 +204,7 @@ Options for creating a Graph/Data file:
|
||||
* The read depth (number of times each sequence is read)
|
||||
* The read error rate (probability a sequence is misread)
|
||||
* The error collision rate (probability two misreads produce the same spurious sequence)
|
||||
* The real sequence collision rate (probability that a misread will produce a different, real sequence from the sample plate. Only applies to new misreads; once an error of this type has occurred, it's likelihood of ocurring again is dominated by the error collision probability.)
|
||||
|
||||
These files do not have a human-readable structure, and are not portable to other programs.
|
||||
|
||||
@@ -211,8 +212,8 @@ These files do not have a human-readable structure, and are not portable to othe
|
||||
|
||||
For portability of graph data to other software, turn on [GraphML](http://graphml.graphdrawing.org/index.html) output
|
||||
in the Options menu in interactive mode, or use the `-graphml`command line argument. This will produce a .graphml file
|
||||
for the weighted graph, with vertex attributes for sequence, type, and occupancy data. This graph contains all the data
|
||||
necessary for the BiGpairSEQ matching algorithm. It does not include the data to measure pairing accuracy; for that,
|
||||
for the weighted graph, with vertex attributes for sequence, type, total occupancy, total read count, and the read count for every individual occupied well.
|
||||
This graph contains all the data necessary for the BiGpairSEQ matching algorithm. It does not include the data to measure pairing accuracy; for that,
|
||||
compare the matching results to the original Cell Sample .csv file.
|
||||
|
||||
---
|
||||
@@ -361,8 +362,10 @@ roughly as though it had a constant well population equal to the plate's average
|
||||
* ~~Add read depth simulation options to CLI~~ DONE
|
||||
* ~~Update graphml output to reflect current Vertex class attributes~~ DONE
|
||||
* Individual well data from the SequenceRecords could be included, if there's ever a reason for it
|
||||
* ~~Implement simulation of sequences being misread as other real sequence~~ DONE
|
||||
* Update matching metadata output options in CLI
|
||||
* Update performance data in this readme
|
||||
* Add section to ReadMe describing data filtering methods.
|
||||
* Re-implement CDR1 matching method
|
||||
* Refactor simulator code to collect all needed data in a single scan of the plate
|
||||
* Currently it scans once for the vertices and then again for the edge weights. This made simulating read depth awkward, and incompatible with caching of plate files.
|
||||
|
||||
@@ -150,16 +150,21 @@ public class CommandLineInterface {
|
||||
Integer readDepth = 1;
|
||||
Double readErrorRate = 0.0;
|
||||
Double errorCollisionRate = 0.0;
|
||||
Double realSequenceCollisionRate = 0.0;
|
||||
if (line.hasOption("rd")) {
|
||||
readDepth = Integer.parseInt(line.getOptionValue("rd"));
|
||||
}
|
||||
if (line.hasOption("err")) {
|
||||
readErrorRate = Double.parseDouble(line.getOptionValue("err"));
|
||||
}
|
||||
if (line.hasOption("coll")) {
|
||||
errorCollisionRate = Double.parseDouble(line.getOptionValue("coll"));
|
||||
if (line.hasOption("errcoll")) {
|
||||
errorCollisionRate = Double.parseDouble(line.getOptionValue("errcoll"));
|
||||
}
|
||||
graph = Simulator.makeCDR3Graph(cells, plate, readDepth, readErrorRate, errorCollisionRate, false);
|
||||
if (line.hasOption("realcoll")) {
|
||||
realSequenceCollisionRate = Double.parseDouble(line.getOptionValue("realcoll"));
|
||||
}
|
||||
graph = Simulator.makeCDR3Graph(cells, plate, readDepth, readErrorRate, errorCollisionRate,
|
||||
realSequenceCollisionRate, false);
|
||||
if (!line.hasOption("no-binary")) { //output binary file unless told not to
|
||||
GraphDataObjectWriter writer = new GraphDataObjectWriter(outputFilename, graph, false);
|
||||
writer.writeDataToFile();
|
||||
@@ -413,12 +418,20 @@ public class CommandLineInterface {
|
||||
.hasArg()
|
||||
.argName("prob")
|
||||
.build();
|
||||
Option errorCollisionProb = Option.builder("coll")
|
||||
Option errorCollisionProb = Option.builder("errcoll")
|
||||
.longOpt("error-collision-prob")
|
||||
.desc("(Optional) The probability that two misreads will produce the same spurious sequence. (0.0 - 1.0)")
|
||||
.hasArg()
|
||||
.argName("prob")
|
||||
.build();
|
||||
Option realSequenceCollisionProb = Option.builder("realcoll")
|
||||
.longOpt("real-collision-prob")
|
||||
.desc("(Optional) The probability that a sequence will be misread " +
|
||||
"as another real sequence. (Only applies to unique misreads; after this has happened once, " +
|
||||
"future error collisions could produce the real sequence again) (0.0 - 1.0)")
|
||||
.hasArg()
|
||||
.argName("prob")
|
||||
.build();
|
||||
graphOptions.addOption(cellFilename);
|
||||
graphOptions.addOption(plateFilename);
|
||||
graphOptions.addOption(outputFileOption());
|
||||
@@ -427,6 +440,7 @@ public class CommandLineInterface {
|
||||
graphOptions.addOption(readDepth);
|
||||
graphOptions.addOption(readErrorProb);
|
||||
graphOptions.addOption(errorCollisionProb);
|
||||
graphOptions.addOption(realSequenceCollisionProb);
|
||||
return graphOptions;
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ import org.jgrapht.nio.AttributeType;
|
||||
import org.jgrapht.nio.DefaultAttribute;
|
||||
import org.jgrapht.nio.graphml.GraphMLExporter;
|
||||
import org.jgrapht.nio.graphml.GraphMLExporter.AttributeCategory;
|
||||
import org.w3c.dom.Attr;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
@@ -13,6 +12,7 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
public class GraphMLFileWriter {
|
||||
@@ -41,11 +41,11 @@ public class GraphMLFileWriter {
|
||||
}
|
||||
|
||||
private Map<String, Attribute> createGraphAttributes(){
|
||||
Map<String, Attribute> ga = new HashMap<>();
|
||||
Map<String, Attribute> attributes = new HashMap<>();
|
||||
//Sample plate filename
|
||||
ga.put("sample plate filename", DefaultAttribute.createAttribute(data.getSourceFilename()));
|
||||
attributes.put("sample plate filename", DefaultAttribute.createAttribute(data.getSourceFilename()));
|
||||
// Number of wells
|
||||
ga.put("well count", DefaultAttribute.createAttribute(data.getNumWells().toString()));
|
||||
attributes.put("well count", DefaultAttribute.createAttribute(data.getNumWells().toString()));
|
||||
//Well populations
|
||||
Integer[] wellPopulations = data.getWellPopulations();
|
||||
StringBuilder populationsStringBuilder = new StringBuilder();
|
||||
@@ -55,11 +55,37 @@ public class GraphMLFileWriter {
|
||||
populationsStringBuilder.append(wellPopulations[i].toString());
|
||||
}
|
||||
String wellPopulationsString = populationsStringBuilder.toString();
|
||||
ga.put("well populations", DefaultAttribute.createAttribute(wellPopulationsString));
|
||||
ga.put("read depth", DefaultAttribute.createAttribute(data.getReadDepth().toString()));
|
||||
ga.put("read error rate", DefaultAttribute.createAttribute(data.getReadErrorRate().toString()));
|
||||
ga.put("error collision rate", DefaultAttribute.createAttribute(data.getErrorCollisionRate().toString()));
|
||||
return ga;
|
||||
attributes.put("well populations", DefaultAttribute.createAttribute(wellPopulationsString));
|
||||
attributes.put("read depth", DefaultAttribute.createAttribute(data.getReadDepth().toString()));
|
||||
attributes.put("read error rate", DefaultAttribute.createAttribute(data.getReadErrorRate().toString()));
|
||||
attributes.put("error collision rate", DefaultAttribute.createAttribute(data.getErrorCollisionRate().toString()));
|
||||
attributes.put("real sequence collision rate", DefaultAttribute.createAttribute(data.getRealSequenceCollisionRate()));
|
||||
return attributes;
|
||||
}
|
||||
|
||||
private Map<String, Attribute> createVertexAttributes(Vertex v){
|
||||
Map<String, Attribute> attributes = new HashMap<>();
|
||||
//sequence type
|
||||
attributes.put("type", DefaultAttribute.createAttribute(v.getType().name()));
|
||||
//sequence
|
||||
attributes.put("sequence", DefaultAttribute.createAttribute(v.getSequence()));
|
||||
//number of wells the sequence appears in
|
||||
attributes.put("occupancy", DefaultAttribute.createAttribute(v.getOccupancy()));
|
||||
//total number of times the sequence was read
|
||||
attributes.put("total read count", DefaultAttribute.createAttribute(v.getReadCount()));
|
||||
StringBuilder wellsAndReadCountsBuilder = new StringBuilder();
|
||||
Iterator<Map.Entry<Integer, Integer>> wellOccupancies = v.getWellOccupancies().entrySet().iterator();
|
||||
while (wellOccupancies.hasNext()) {
|
||||
Map.Entry<Integer, Integer> entry = wellOccupancies.next();
|
||||
wellsAndReadCountsBuilder.append(entry.getKey() + ":" + entry.getValue());
|
||||
if (wellOccupancies.hasNext()) {
|
||||
wellsAndReadCountsBuilder.append(", ");
|
||||
}
|
||||
}
|
||||
String wellsAndReadCounts = wellsAndReadCountsBuilder.toString();
|
||||
//the wells the sequence appears in and the read counts in those wells
|
||||
attributes.put("wells:read counts", DefaultAttribute.createAttribute(wellsAndReadCounts));
|
||||
return attributes;
|
||||
}
|
||||
|
||||
public void writeGraphToFile() {
|
||||
@@ -72,15 +98,7 @@ public class GraphMLFileWriter {
|
||||
//Set graph attributes
|
||||
exporter.setGraphAttributeProvider( () -> graphAttributes);
|
||||
//set type, sequence, and occupancy attributes for each vertex
|
||||
//NEED TO ADD NEW FIELD FOR READ COUNT
|
||||
exporter.setVertexAttributeProvider( v -> {
|
||||
Map<String, Attribute> attributes = new HashMap<>();
|
||||
attributes.put("type", DefaultAttribute.createAttribute(v.getType().name()));
|
||||
attributes.put("sequence", DefaultAttribute.createAttribute(v.getSequence()));
|
||||
attributes.put("occupancy", DefaultAttribute.createAttribute(v.getOccupancy()));
|
||||
attributes.put("read count", DefaultAttribute.createAttribute(v.getReadCount()));
|
||||
return attributes;
|
||||
});
|
||||
exporter.setVertexAttributeProvider(this::createVertexAttributes);
|
||||
//register the attributes
|
||||
for(String s : graphAttributes.keySet()) {
|
||||
exporter.registerAttribute(s, AttributeCategory.GRAPH, AttributeType.STRING);
|
||||
@@ -88,7 +106,8 @@ public class GraphMLFileWriter {
|
||||
exporter.registerAttribute("type", AttributeCategory.NODE, AttributeType.STRING);
|
||||
exporter.registerAttribute("sequence", AttributeCategory.NODE, AttributeType.STRING);
|
||||
exporter.registerAttribute("occupancy", AttributeCategory.NODE, AttributeType.STRING);
|
||||
exporter.registerAttribute("read count", AttributeCategory.NODE, AttributeType.STRING);
|
||||
exporter.registerAttribute("total read count", AttributeCategory.NODE, AttributeType.STRING);
|
||||
exporter.registerAttribute("wells:read counts", AttributeCategory.NODE, AttributeType.STRING);
|
||||
//export the graph
|
||||
exporter.exportGraph(graph, writer);
|
||||
} catch(IOException ex){
|
||||
|
||||
@@ -11,13 +11,14 @@ public class GraphWithMapData implements java.io.Serializable {
|
||||
|
||||
private String sourceFilename;
|
||||
private final SimpleWeightedGraph graph;
|
||||
private Integer numWells;
|
||||
private Integer[] wellPopulations;
|
||||
private Integer alphaCount;
|
||||
private Integer betaCount;
|
||||
private int readDepth;
|
||||
private double readErrorRate;
|
||||
private double errorCollisionRate;
|
||||
private final int numWells;
|
||||
private final Integer[] wellPopulations;
|
||||
private final int alphaCount;
|
||||
private final int betaCount;
|
||||
private final int readDepth;
|
||||
private final double readErrorRate;
|
||||
private final double errorCollisionRate;
|
||||
private final double realSequenceCollisionRate;
|
||||
private final Map<String, String> distCellsMapAlphaKey;
|
||||
// private final Map<Integer, Integer> plateVtoAMap;
|
||||
// private final Map<Integer, Integer> plateVtoBMap;
|
||||
@@ -29,7 +30,8 @@ public class GraphWithMapData implements java.io.Serializable {
|
||||
|
||||
public GraphWithMapData(SimpleWeightedGraph graph, Integer numWells, Integer[] wellConcentrations,
|
||||
Map<String, String> distCellsMapAlphaKey, Integer alphaCount, Integer betaCount,
|
||||
Integer readDepth, Double readErrorRate, Double errorCollisionRate, Duration time){
|
||||
Integer readDepth, Double readErrorRate, Double errorCollisionRate,
|
||||
Double realSequenceCollisionRate, Duration time){
|
||||
|
||||
// Map<Integer, Integer> plateVtoAMap,
|
||||
// Map<Integer,Integer> plateVtoBMap, Map<Integer, Integer> plateAtoVMap,
|
||||
@@ -50,6 +52,7 @@ public class GraphWithMapData implements java.io.Serializable {
|
||||
this.readDepth = readDepth;
|
||||
this.readErrorRate = readErrorRate;
|
||||
this.errorCollisionRate = errorCollisionRate;
|
||||
this.realSequenceCollisionRate = realSequenceCollisionRate;
|
||||
this.time = time;
|
||||
}
|
||||
|
||||
@@ -122,4 +125,6 @@ public class GraphWithMapData implements java.io.Serializable {
|
||||
public Double getErrorCollisionRate() {
|
||||
return errorCollisionRate;
|
||||
}
|
||||
|
||||
public Double getRealSequenceCollisionRate() { return realSequenceCollisionRate; }
|
||||
}
|
||||
|
||||
@@ -255,6 +255,7 @@ public class InteractiveInterface {
|
||||
int readDepth = 1;
|
||||
double readErrorRate = 0.0;
|
||||
double errorCollisionRate = 0.0;
|
||||
double realSequenceCollisionRate = 0.0;
|
||||
try {
|
||||
String str = "\nGenerating bipartite weighted graph encoding occupancy overlap data ";
|
||||
str = str.concat("\nrequires a cell sample file and a sample plate file.");
|
||||
@@ -264,7 +265,6 @@ public class InteractiveInterface {
|
||||
System.out.print("\nPlease enter name of an existing sample plate file: ");
|
||||
plateFile = sc.next();
|
||||
System.out.println("\nEnable simulation of sequence read depth and sequence read errors? (y/n)");
|
||||
System.out.println("NOTE: sample plate data cannot be cached when simulating read errors");
|
||||
String ans = sc.next();
|
||||
Pattern pattern = Pattern.compile("(?:yes|y)", Pattern.CASE_INSENSITIVE);
|
||||
Matcher matcher = pattern.matcher(ans);
|
||||
@@ -272,25 +272,29 @@ public class InteractiveInterface {
|
||||
simulateReadDepth = true;
|
||||
}
|
||||
if (simulateReadDepth) {
|
||||
BiGpairSEQ.setCachePlate(false);
|
||||
BiGpairSEQ.clearPlateInMemory();
|
||||
System.out.print("\nPlease enter read depth (the integer number of reads per sequence): ");
|
||||
System.out.print("\nPlease enter the read depth (the integer number of times a sequence is read): ");
|
||||
readDepth = sc.nextInt();
|
||||
if(readDepth < 1) {
|
||||
throw new InputMismatchException("The read depth must be an integer >= 1");
|
||||
}
|
||||
System.out.print("\nPlease enter probability of a sequence read error (0.0 to 1.0): ");
|
||||
System.out.println("\nPlease enter the read error probability (0.0 to 1.0)");
|
||||
System.out.print("(The probability that a sequence will be misread): ");
|
||||
readErrorRate = sc.nextDouble();
|
||||
if(readErrorRate < 0.0 || readErrorRate > 1.0) {
|
||||
throw new InputMismatchException("The read error rate must be in the range [0.0, 1.0]");
|
||||
throw new InputMismatchException("The read error probability must be in the range [0.0, 1.0]");
|
||||
}
|
||||
System.out.println("\nPlease enter the probability of read error collision");
|
||||
System.out.println("(the likelihood that two read errors produce the same spurious sequence)");
|
||||
System.out.print("(0.0 to 1.0): ");
|
||||
System.out.println("\nPlease enter the error collision probability (0.0 to 1.0)");
|
||||
System.out.print("(The probability of a sequence being misread in a way it has been misread before): ");
|
||||
errorCollisionRate = sc.nextDouble();
|
||||
if(errorCollisionRate < 0.0 || errorCollisionRate > 1.0) {
|
||||
throw new InputMismatchException("The error collision probability must be an in the range [0.0, 1.0]");
|
||||
}
|
||||
System.out.println("\nPlease enter the real sequence collision probability (0.0 to 1.0)");
|
||||
System.out.print("(The probability that a (non-collision) misread produces a different, real sequence): ");
|
||||
realSequenceCollisionRate = sc.nextDouble();
|
||||
if(realSequenceCollisionRate < 0.0 || realSequenceCollisionRate > 1.0) {
|
||||
throw new InputMismatchException("The real sequence collision probability must be an in the range [0.0, 1.0]");
|
||||
}
|
||||
}
|
||||
System.out.println("\nThe graph and occupancy data will be written to a file.");
|
||||
System.out.print("Please enter a name for the output file: ");
|
||||
@@ -338,7 +342,8 @@ public class InteractiveInterface {
|
||||
System.out.println("Returning to main menu.");
|
||||
}
|
||||
else{
|
||||
GraphWithMapData data = Simulator.makeCDR3Graph(cellSample, plate, readDepth, readErrorRate, errorCollisionRate, true);
|
||||
GraphWithMapData data = Simulator.makeCDR3Graph(cellSample, plate, readDepth, readErrorRate,
|
||||
errorCollisionRate, realSequenceCollisionRate, true);
|
||||
assert filename != null;
|
||||
if(BiGpairSEQ.outputBinary()) {
|
||||
GraphDataObjectWriter dataWriter = new GraphDataObjectWriter(filename, data);
|
||||
|
||||
@@ -2,6 +2,13 @@
|
||||
|
||||
/*
|
||||
TODO: Implement exponential distribution using inversion method - DONE
|
||||
TODO: Implement collisions with real sequences by having the counting function keep a map of all sequences it's read,
|
||||
with values of all misreads. Can then have a spurious/real collision rate, which will have count randomly select a sequence
|
||||
it's already read at least once, and put that into the list of spurious sequences for the given real sequence. Will let me get rid
|
||||
of the distinctMisreadCount map, and use this new map instead. Doing it this way, once a sequence has been misread as another
|
||||
sequence once, it is more likely to be misread that way again, as future read error collisions can also be real sequence collisions
|
||||
Prob A: a read error occurs. Prob B: it's a new error (otherwise it's a repeated error). Prob C: if new error, prob that it's
|
||||
a real sequence collision (otherwise it's a new spurious sequence) - DONE
|
||||
TODO: Implement discrete frequency distributions using Vose's Alias Method
|
||||
*/
|
||||
|
||||
@@ -148,79 +155,83 @@ public class Plate {
|
||||
return wells;
|
||||
}
|
||||
|
||||
// //returns a map of the counts of the sequence at cell index sIndex, in all wells
|
||||
// public void assayWellsSequenceS(Map<String, Integer> sequences, int... sIndices){
|
||||
// this.assayWellsSequenceS(sequences, 0, size, sIndices);
|
||||
// }
|
||||
//
|
||||
// //returns a map of the counts of the sequence at cell index sIndex, in a specific well
|
||||
// public void assayWellsSequenceS(Map<String, Integer> sequences, int n, int... sIndices) {
|
||||
// this.assayWellsSequenceS(sequences, n, n+1, sIndices);
|
||||
// }
|
||||
//
|
||||
// //returns a map of the counts of the sequence at cell index sIndex, in a range of wells
|
||||
// public void assayWellsSequenceS(Map<String, Integer> sequences, int start, int end, int... sIndices) {
|
||||
// for(int sIndex: sIndices){
|
||||
// for(int i = start; i < end; i++){
|
||||
// countSequences(sequences, wells.get(i), sIndex);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// //For the sequences at cell indices sIndices, counts number of unique sequences in the given well into the given map
|
||||
// private void countSequences(Map<String, Integer> wellMap, List<String[]> well, int... sIndices) {
|
||||
// for(String[] cell : well) {
|
||||
// for(int sIndex: sIndices){
|
||||
// //skip dropout sequences, which have value -1
|
||||
// if(!"-1".equals(cell[sIndex])){
|
||||
// wellMap.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
//For the sequences at cell indices sIndices, counts number of unique sequences in all well into the given map
|
||||
//For the sequences at cell indices sIndices, counts number of unique sequences in all wells.
|
||||
//Also simulates sequence read errors with given probabilities.
|
||||
//Returns a map of SequenceRecords containing plate data for all sequences read.
|
||||
//TODO actually implement usage of misreadSequences - DONE
|
||||
public Map<String, SequenceRecord> countSequences(Integer readDepth, Double readErrorRate,
|
||||
Double errorCollisionRate, int... sIndices) {
|
||||
Double errorCollisionRate, Double realSequenceCollisionRate, int... sIndices) {
|
||||
SequenceType[] sequenceTypes = EnumSet.allOf(SequenceType.class).toArray(new SequenceType[0]);
|
||||
Map<String, Integer> distinctMisreadCounts = new HashMap<>();
|
||||
//Map of all real sequences read. Keys are sequences, values are ways sequence has been misread.
|
||||
Map<String, List<String>> sequencesAndMisreads = new HashMap<>();
|
||||
//Map of all sequences read. Keys are sequences, values are associated SequenceRecords
|
||||
Map<String, SequenceRecord> sequenceMap = new LinkedHashMap<>();
|
||||
//get list of all distinct, real sequences
|
||||
String[] realSequences = assayWells(sIndices).toArray(new String[0]);
|
||||
for (int well = 0; well < size; well++) {
|
||||
for (String[] cell : wells.get(well)) {
|
||||
for (int sIndex : sIndices) {
|
||||
for (String[] cell: wells.get(well)) {
|
||||
for (int sIndex: sIndices) {
|
||||
//the sequence being read
|
||||
String currentSequence = cell[sIndex];
|
||||
//skip dropout sequences, which have value -1
|
||||
if (!"-1".equals(cell[sIndex])) {
|
||||
if (!"-1".equals(currentSequence)) {
|
||||
//keep rereading the sequence until the read depth is reached
|
||||
for (int j = 0; j < readDepth; j++) {
|
||||
//Misread sequence
|
||||
//The sequence is misread
|
||||
if (rand.nextDouble() < readErrorRate) {
|
||||
StringBuilder spurious = new StringBuilder(cell[sIndex]);
|
||||
//if this sequence hasn't been misread before, or the read error is unique,
|
||||
//append one more "*" than has been appended before
|
||||
if (rand.nextDouble() > errorCollisionRate || !distinctMisreadCounts.containsKey(cell[sIndex])) {
|
||||
distinctMisreadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
|
||||
for (int k = 0; k < distinctMisreadCounts.get(cell[sIndex]); k++) {
|
||||
spurious.append("*");
|
||||
}
|
||||
SequenceRecord tmp = new SequenceRecord(spurious.toString(), sequenceTypes[sIndex]);
|
||||
tmp.addRead(well);
|
||||
sequenceMap.put(spurious.toString(), tmp);
|
||||
//The sequence hasn't been read or misread before
|
||||
if (!sequencesAndMisreads.containsKey(currentSequence)) {
|
||||
sequencesAndMisreads.put(currentSequence, new ArrayList<>());
|
||||
}
|
||||
//if this is a read error collision, randomly choose a number of "*"s that has been appended before
|
||||
else {
|
||||
int starCount = rand.nextInt(distinctMisreadCounts.get(cell[sIndex]));
|
||||
for (int k = 0; k < starCount; k++) {
|
||||
spurious.append("*");
|
||||
//The specific misread hasn't happened before
|
||||
if (rand.nextDouble() >= errorCollisionRate || sequencesAndMisreads.get(currentSequence).size() == 0) {
|
||||
//The misread doesn't collide with a real sequence already on the plate and some sequences have already been read
|
||||
if(rand.nextDouble() >= realSequenceCollisionRate || !sequenceMap.isEmpty()){
|
||||
StringBuilder spurious = new StringBuilder(currentSequence);
|
||||
for (int k = 0; k <= sequencesAndMisreads.get(currentSequence).size(); k++) {
|
||||
spurious.append("*");
|
||||
}
|
||||
//New sequence record for the spurious sequence
|
||||
SequenceRecord tmp = new SequenceRecord(spurious.toString(), sequenceTypes[sIndex]);
|
||||
tmp.addRead(well);
|
||||
sequenceMap.put(spurious.toString(), tmp);
|
||||
//add spurious sequence to list of misreads for the real sequence
|
||||
sequencesAndMisreads.get(currentSequence).add(spurious.toString());
|
||||
}
|
||||
//The misread collides with a real sequence already read from plate
|
||||
else {
|
||||
String wrongSequence;
|
||||
do{
|
||||
//get a random real sequence that's been read from the plate before
|
||||
int index = rand.nextInt(realSequences.length);
|
||||
wrongSequence = realSequences[index];
|
||||
//make sure it's not accidentally the *right* sequence
|
||||
//Also that it's not a wrong sequence already in the misread list
|
||||
} while(currentSequence.equals(wrongSequence) || sequencesAndMisreads.get(currentSequence).contains(wrongSequence));
|
||||
//update the SequenceRecord for wrongSequence
|
||||
sequenceMap.get(wrongSequence).addRead(well);
|
||||
//add wrongSequence to the misreads for currentSequence
|
||||
sequencesAndMisreads.get(currentSequence).add(wrongSequence);
|
||||
}
|
||||
sequenceMap.get(spurious.toString()).addRead(well);
|
||||
}
|
||||
}
|
||||
//sequence is read correctly
|
||||
//The sequence is read correctly
|
||||
else {
|
||||
if (!sequenceMap.containsKey(cell[sIndex])) {
|
||||
SequenceRecord tmp = new SequenceRecord(cell[sIndex], sequenceTypes[sIndex]);
|
||||
//the sequence hasn't been read before
|
||||
if (!sequenceMap.containsKey(currentSequence)) {
|
||||
//create new record for the sequence
|
||||
SequenceRecord tmp = new SequenceRecord(currentSequence, sequenceTypes[sIndex]);
|
||||
//add this read to the sequence record
|
||||
tmp.addRead(well);
|
||||
sequenceMap.put(cell[sIndex], tmp);
|
||||
} else {
|
||||
sequenceMap.get(cell[sIndex]).addRead(well);
|
||||
//add the sequence and its record to the sequence map
|
||||
sequenceMap.put(currentSequence, tmp);
|
||||
//add the sequence to the sequences and misreads map
|
||||
sequencesAndMisreads.put(currentSequence, new ArrayList<>());
|
||||
}
|
||||
//the sequence has been read before
|
||||
else {
|
||||
//get the sequence's record and add this read to it
|
||||
sequenceMap.get(currentSequence).addRead(well);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -231,97 +242,17 @@ public class Plate {
|
||||
return sequenceMap;
|
||||
}
|
||||
|
||||
|
||||
// //returns a map of the counts of the sequence at cell index sIndex, in all wells
|
||||
// //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map.
|
||||
// public void assayWellsSequenceSWithReadDepth(Map<String, Integer> misreadCounts, Map<String, Integer> occupancyMap, Map<String, Integer> readCountMap,
|
||||
// int readDepth, double readErrorProb, double errorCollisionProb, int... sIndices) {
|
||||
// this.assayWellsSequenceSWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, 0, size, sIndices);
|
||||
// }
|
||||
// //returns a map of the counts of the sequence at cell index sIndex, in a specific of wells
|
||||
// //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map.
|
||||
// public void assayWellsSequenceSWithReadDepth(Map<String, Integer> misreadCounts, Map<String, Integer> occupancyMap, Map<String, Integer> readCountMap,
|
||||
// int readDepth, double readErrorProb, double errorCollisionProb,
|
||||
// int n, int... sIndices) {
|
||||
// this.assayWellsSequenceSWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, n, n+1, sIndices);
|
||||
// }
|
||||
//
|
||||
// //returns a map of the counts of the sequence at cell index sIndex, in a range of wells
|
||||
// //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map.
|
||||
// public void assayWellsSequenceSWithReadDepth(Map<String, Integer> misreadCounts, Map<String, Integer> occupancyMap, Map<String, Integer> readCountMap,
|
||||
// int readDepth, double readErrorProb, double errorCollisionProb,
|
||||
// int start, int end, int... sIndices) {
|
||||
// for(int sIndex: sIndices){
|
||||
// for(int i = start; i < end; i++){
|
||||
// countSequencesWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, wells.get(i), sIndex);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// //For the sequences at cell indices sIndices, counts number of unique sequences in the given well into the given map
|
||||
// //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map.
|
||||
// //NOTE: this function changes the content of the well, adding spurious cells to contain the misread sequences
|
||||
// //(this is necessary because, in the simulation, the plate is read multiple times, but random misreads can only
|
||||
// //be simulated once).
|
||||
// //(Possibly I should refactor all of this to only require a single plate assay, to speed things up. Or at least
|
||||
// //to see if it would speed things up.)
|
||||
// private void countSequencesWithReadDepth(Map<String, Integer> distinctMisreadCounts, Map<String, Integer> occupancyMap, Map<String, Integer> readCountMap,
|
||||
// int readDepth, double readErrorProb, double errorCollisionProb,
|
||||
// List<String[]> well, int... sIndices) {
|
||||
// //list of spurious cells to add to well after counting
|
||||
// List<String[]> spuriousCells = new ArrayList<>();
|
||||
// for(String[] cell : well) {
|
||||
// //new potential spurious cell for each cell that gets read
|
||||
// String[] spuriousCell = new String[SequenceType.values().length];
|
||||
// //initialize spurious cell with all dropout sequences
|
||||
// Arrays.fill(spuriousCell, "-1");
|
||||
// //has a read error occurred?
|
||||
// boolean readError = false;
|
||||
// for(int sIndex: sIndices){
|
||||
// //skip dropout sequences, which have value "-1"
|
||||
// if(!"-1".equals(cell[sIndex])){
|
||||
// Map<String, Integer> sequencesWithReadCounts = new LinkedHashMap<>();
|
||||
// for(int i = 0; i < readDepth; i++) {
|
||||
// if (rand.nextDouble() <= readErrorProb) {
|
||||
// readError = true;
|
||||
// //Read errors are represented by appending "*"s to the end of the sequence some number of times
|
||||
// StringBuilder spurious = new StringBuilder(cell[sIndex]);
|
||||
// //if this sequence hasn't been misread before, or the read error is unique,
|
||||
// //append one more "*" than has been appended before
|
||||
// if (!distinctMisreadCounts.containsKey(cell[sIndex]) || rand.nextDouble() > errorCollisionProb) {
|
||||
// distinctMisreadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
|
||||
// for (int j = 0; j < distinctMisreadCounts.get(cell[sIndex]); j++) {
|
||||
// spurious.append("*");
|
||||
// }
|
||||
// }
|
||||
// //if this is a read error collision, randomly choose a number of "*"s that has been appended before
|
||||
// else {
|
||||
// int starCount = rand.nextInt(distinctMisreadCounts.get(cell[sIndex]));
|
||||
// for (int j = 0; j < starCount; j++) {
|
||||
// spurious.append("*");
|
||||
// }
|
||||
// }
|
||||
// sequencesWithReadCounts.merge(spurious.toString(), 1, (oldValue, newValue) -> oldValue + newValue);
|
||||
// //add spurious sequence to spurious cell
|
||||
// spuriousCell[sIndex] = spurious.toString();
|
||||
// }
|
||||
// else {
|
||||
// sequencesWithReadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
|
||||
// }
|
||||
// }
|
||||
// for(String seq : sequencesWithReadCounts.keySet()) {
|
||||
// occupancyMap.merge(seq, 1, (oldValue, newValue) -> oldValue + newValue);
|
||||
// readCountMap.merge(seq, sequencesWithReadCounts.get(seq), (oldValue, newValue) -> oldValue + newValue);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// if (readError) { //only add a new spurious cell if there was a read error
|
||||
// spuriousCells.add(spuriousCell);
|
||||
// }
|
||||
// }
|
||||
// //add all spurious cells to the well
|
||||
// well.addAll(spuriousCells);
|
||||
// }
|
||||
private HashSet<String> assayWells(int[] indices) {
|
||||
HashSet<String> allSequences = new HashSet<>();
|
||||
for (List<String[]> well: wells) {
|
||||
for (String[] cell: well) {
|
||||
for(int index: indices) {
|
||||
allSequences.add(cell[index]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return allSequences;
|
||||
}
|
||||
|
||||
public String getSourceFileName() {
|
||||
return sourceFile;
|
||||
|
||||
@@ -27,7 +27,8 @@ public class Simulator implements GraphModificationFunctions {
|
||||
|
||||
|
||||
public static GraphWithMapData makeCDR3Graph(CellSample cellSample, Plate samplePlate, int readDepth,
|
||||
double readErrorRate, double errorCollisionRate, boolean verbose) {
|
||||
double readErrorRate, double errorCollisionRate,
|
||||
double realSequenceCollisionRate, boolean verbose) {
|
||||
//start timing
|
||||
Instant start = Instant.now();
|
||||
int[] alphaIndices = {SequenceType.CDR3_ALPHA.ordinal()};
|
||||
@@ -44,11 +45,11 @@ public class Simulator implements GraphModificationFunctions {
|
||||
//Make linkedHashMap keyed to sequences, values are SequenceRecords reflecting plate statistics
|
||||
if(verbose){System.out.println("Making sample plate sequence maps");}
|
||||
Map<String, SequenceRecord> alphaSequences = samplePlate.countSequences(readDepth, readErrorRate,
|
||||
errorCollisionRate, alphaIndices);
|
||||
errorCollisionRate, realSequenceCollisionRate, alphaIndices);
|
||||
int alphaCount = alphaSequences.size();
|
||||
if(verbose){System.out.println("Alphas sequences read: " + alphaCount);}
|
||||
Map<String, SequenceRecord> betaSequences = samplePlate.countSequences(readDepth, readErrorRate,
|
||||
errorCollisionRate, betaIndices);
|
||||
errorCollisionRate, realSequenceCollisionRate, betaIndices);
|
||||
int betaCount = betaSequences.size();
|
||||
if(verbose){System.out.println("Betas sequences read: " + betaCount);}
|
||||
if(verbose){System.out.println("Sample plate sequence maps made");}
|
||||
@@ -131,7 +132,7 @@ public class Simulator implements GraphModificationFunctions {
|
||||
Duration time = Duration.between(start, stop);
|
||||
//create GraphWithMapData object
|
||||
GraphWithMapData output = new GraphWithMapData(graph, numWells, samplePlate.getPopulations(), distCellsMapAlphaKey,
|
||||
alphaCount, betaCount, readDepth, readErrorRate, errorCollisionRate, time);
|
||||
alphaCount, betaCount, readDepth, readErrorRate, errorCollisionRate, realSequenceCollisionRate, time);
|
||||
//Set source file name in graph to name of sample plate
|
||||
output.setSourceFilename(samplePlate.getFilename());
|
||||
//return GraphWithMapData object
|
||||
@@ -307,6 +308,7 @@ public class Simulator implements GraphModificationFunctions {
|
||||
metadata.put("sequence read depth", data.getReadDepth().toString());
|
||||
metadata.put("sequence read error rate", data.getReadErrorRate().toString());
|
||||
metadata.put("read error collision rate", data.getErrorCollisionRate().toString());
|
||||
metadata.put("real sequence collision rate", data.getRealSequenceCollisionRate().toString());
|
||||
metadata.put("total alphas read from plate", data.getAlphaCount().toString());
|
||||
metadata.put("total betas read from plate", data.getBetaCount().toString());
|
||||
//HARD CODED, PARAMETERIZE LATER
|
||||
|
||||
@@ -32,6 +32,8 @@ public class Vertex implements Serializable, Comparable<Vertex> {
|
||||
|
||||
public Integer getReadCount() { return record.getReadCount(); }
|
||||
|
||||
public Integer getReadCount(Integer well) { return record.getReadCount(well); }
|
||||
|
||||
public Map<Integer, Integer> getWellOccupancies() { return record.getWellOccupancies(); }
|
||||
|
||||
@Override //adapted from JGraphT example code
|
||||
|
||||
Reference in New Issue
Block a user