Rewrite countSequences to allow for collision with real sequences on misreads
This commit is contained in:
@@ -2,6 +2,13 @@
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
TODO: Implement exponential distribution using inversion method - DONE
|
TODO: Implement exponential distribution using inversion method - DONE
|
||||||
|
TODO: Implement collisions with real sequences by having the counting function keep a map of all sequences it's read,
|
||||||
|
with values of all misreads. Can then have a spurious/real collision rate, which will have count randomly select a sequence
|
||||||
|
it's already read at least once, and put that into the list of spurious sequences for the given real sequence. Will let me get rid
|
||||||
|
of the distinctMisreadCount map, and use this new map instead. Doing it this way, once a sequence has been misread as another
|
||||||
|
sequence once, it is more likely to be misread that way again, as future read error collisions can also be real sequence collisions
|
||||||
|
Prob A: a read error occurs. Prob B: it's a new error (otherwise it's a repeated error). Prob C: if new error, prob that it's
|
||||||
|
a real sequence collision (otherwise it's a new spurious sequence) - DONE
|
||||||
TODO: Implement discrete frequency distributions using Vose's Alias Method
|
TODO: Implement discrete frequency distributions using Vose's Alias Method
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@@ -148,49 +155,76 @@ public class Plate {
|
|||||||
return wells;
|
return wells;
|
||||||
}
|
}
|
||||||
|
|
||||||
//For the sequences at cell indices sIndices, counts number of unique sequences in all wells into the given map
|
//For the sequences at cell indices sIndices, counts number of unique sequences in all wells.
|
||||||
|
//Also simulates sequence read errors with given probabilities.
|
||||||
|
//Returns a map of SequenceRecords containing plate data for all sequences read.
|
||||||
|
//TODO actually implement usage of misreadSequences - DONE
|
||||||
public Map<String, SequenceRecord> countSequences(Integer readDepth, Double readErrorRate,
|
public Map<String, SequenceRecord> countSequences(Integer readDepth, Double readErrorRate,
|
||||||
Double errorCollisionRate, int... sIndices) {
|
Double errorCollisionRate, Double realSequenceCollisionRate, int... sIndices) {
|
||||||
SequenceType[] sequenceTypes = EnumSet.allOf(SequenceType.class).toArray(new SequenceType[0]);
|
SequenceType[] sequenceTypes = EnumSet.allOf(SequenceType.class).toArray(new SequenceType[0]);
|
||||||
Map<String, Integer> distinctMisreadCounts = new HashMap<>();
|
//Map of all sequences read. Keys are sequences, values are ways sequence has been misread.
|
||||||
|
Map<String, List<String>> sequencesAndMisreads = new HashMap<>();
|
||||||
Map<String, SequenceRecord> sequenceMap = new LinkedHashMap<>();
|
Map<String, SequenceRecord> sequenceMap = new LinkedHashMap<>();
|
||||||
for (int well = 0; well < size; well++) {
|
for (int well = 0; well < size; well++) {
|
||||||
for (String[] cell : wells.get(well)) {
|
for (String[] cell : wells.get(well)) {
|
||||||
for (int sIndex : sIndices) {
|
for (int sIndex : sIndices) {
|
||||||
|
String currentSequence = cell[sIndex];
|
||||||
//skip dropout sequences, which have value -1
|
//skip dropout sequences, which have value -1
|
||||||
if (!"-1".equals(cell[sIndex])) {
|
if (!"-1".equals(currentSequence)) {
|
||||||
for (int j = 0; j < readDepth; j++) {
|
for (int j = 0; j < readDepth; j++) {
|
||||||
//Misread sequence
|
//The sequence is misread
|
||||||
if (rand.nextDouble() < readErrorRate) {
|
if (rand.nextDouble() < readErrorRate) {
|
||||||
StringBuilder spurious = new StringBuilder(cell[sIndex]);
|
//The sequence hasn't been read or misread before
|
||||||
//if this sequence hasn't been misread before, or the read error is unique,
|
if (!sequencesAndMisreads.containsKey(currentSequence)) {
|
||||||
//append one more "*" than has been appended before
|
sequencesAndMisreads.put(currentSequence, new ArrayList<>());
|
||||||
if (rand.nextDouble() > errorCollisionRate || !distinctMisreadCounts.containsKey(cell[sIndex])) {
|
|
||||||
distinctMisreadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
|
|
||||||
for (int k = 0; k < distinctMisreadCounts.get(cell[sIndex]); k++) {
|
|
||||||
spurious.append("*");
|
|
||||||
}
|
|
||||||
SequenceRecord tmp = new SequenceRecord(spurious.toString(), sequenceTypes[sIndex]);
|
|
||||||
tmp.addRead(well);
|
|
||||||
sequenceMap.put(spurious.toString(), tmp);
|
|
||||||
}
|
}
|
||||||
//if this is a read error collision, randomly choose a number of "*"s that has been appended before
|
//The specific misread hasn't happened before
|
||||||
else {
|
if (rand.nextDouble() >= errorCollisionRate || sequencesAndMisreads.get(currentSequence).size() == 0) {
|
||||||
int starCount = rand.nextInt(distinctMisreadCounts.get(cell[sIndex]));
|
//The misread doesn't collide with a real sequence already on the plate
|
||||||
for (int k = 0; k < starCount; k++) {
|
if(rand.nextDouble() >= realSequenceCollisionRate || !sequenceMap.isEmpty()){
|
||||||
spurious.append("*");
|
StringBuilder spurious = new StringBuilder(currentSequence);
|
||||||
|
for (int k = 0; k <= sequencesAndMisreads.get(currentSequence).size(); k++) {
|
||||||
|
spurious.append("*");
|
||||||
|
}
|
||||||
|
//New sequence record for the spurious sequence
|
||||||
|
SequenceRecord tmp = new SequenceRecord(spurious.toString(), sequenceTypes[sIndex]);
|
||||||
|
tmp.addRead(well);
|
||||||
|
sequenceMap.put(spurious.toString(), tmp);
|
||||||
|
//add spurious sequence to list of misreads for the real sequence
|
||||||
|
sequencesAndMisreads.get(currentSequence).add(spurious.toString());
|
||||||
|
}
|
||||||
|
//The misread collides with a real sequence already read from plate
|
||||||
|
else {
|
||||||
|
String wrongSequence;
|
||||||
|
do{
|
||||||
|
//get a random real sequence that's been read from the plate before
|
||||||
|
int index = rand.nextInt(sequencesAndMisreads.size());
|
||||||
|
wrongSequence = sequencesAndMisreads.keySet().toArray(new String[0])[index];
|
||||||
|
//make sure it's not accidentally the *right* sequence
|
||||||
|
//Also that it's not a wrong sequence already in the misread list
|
||||||
|
} while(cell[sIndex].equals(wrongSequence) || sequencesAndMisreads.get(currentSequence).contains(wrongSequence));
|
||||||
|
sequenceMap.get(wrongSequence).addRead(well);
|
||||||
|
|
||||||
}
|
}
|
||||||
sequenceMap.get(spurious.toString()).addRead(well);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//sequence is read correctly
|
//The sequence is read correctly
|
||||||
else {
|
else {
|
||||||
if (!sequenceMap.containsKey(cell[sIndex])) {
|
//the sequence hasn't been read before
|
||||||
SequenceRecord tmp = new SequenceRecord(cell[sIndex], sequenceTypes[sIndex]);
|
if (!sequenceMap.containsKey(currentSequence)) {
|
||||||
|
//create new record for the sequence
|
||||||
|
SequenceRecord tmp = new SequenceRecord(currentSequence, sequenceTypes[sIndex]);
|
||||||
|
//add this read to the sequence record
|
||||||
tmp.addRead(well);
|
tmp.addRead(well);
|
||||||
sequenceMap.put(cell[sIndex], tmp);
|
//add the sequence and its record to the sequence map
|
||||||
} else {
|
sequenceMap.put(currentSequence, tmp);
|
||||||
sequenceMap.get(cell[sIndex]).addRead(well);
|
//add the sequence to the sequences and misreads map
|
||||||
|
sequencesAndMisreads.put(currentSequence, new ArrayList<>());
|
||||||
|
}
|
||||||
|
//the sequence has been read before
|
||||||
|
else {
|
||||||
|
//get the sequence's record and add this read to it
|
||||||
|
sequenceMap.get(currentSequence).addRead(well);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user