Rewrite countSequences to allow for collision with real sequences on misreads

This commit is contained in:
eugenefischer
2022-09-28 17:44:26 -05:00
parent 9dacd8cd34
commit 0b36e2b742

View File

@@ -2,6 +2,13 @@
/*
TODO: Implement exponential distribution using inversion method - DONE
TODO: Implement collisions with real sequences by having the counting function keep a map of all sequences it's read,
with values of all misreads. Can then have a spurious/real collision rate, which will have count randomly select a sequence
it's already read at least once, and put that into the list of spurious sequences for the given real sequence. Will let me get rid
of the distinctMisreadCount map, and use this new map instead. Doing it this way, once a sequence has been misread as another
sequence once, it is more likely to be misread that way again, as future read error collisions can also be real sequence collisions
Prob A: a read error occurs. Prob B: it's a new error (otherwise it's a repeated error). Prob C: if new error, prob that it's
a real sequence collision (otherwise it's a new spurious sequence) - DONE
TODO: Implement discrete frequency distributions using Vose's Alias Method
*/
@@ -148,49 +155,76 @@ public class Plate {
return wells;
}
//For the sequences at cell indices sIndices, counts number of unique sequences in all wells into the given map
//For the sequences at cell indices sIndices, counts number of unique sequences in all wells.
//Also simulates sequence read errors with given probabilities.
//Returns a map of SequenceRecords containing plate data for all sequences read.
//TODO actually implement usage of misreadSequences - DONE
public Map<String, SequenceRecord> countSequences(Integer readDepth, Double readErrorRate,
Double errorCollisionRate, int... sIndices) {
Double errorCollisionRate, Double realSequenceCollisionRate, int... sIndices) {
SequenceType[] sequenceTypes = EnumSet.allOf(SequenceType.class).toArray(new SequenceType[0]);
Map<String, Integer> distinctMisreadCounts = new HashMap<>();
//Map of all sequences read. Keys are sequences, values are ways sequence has been misread.
Map<String, List<String>> sequencesAndMisreads = new HashMap<>();
Map<String, SequenceRecord> sequenceMap = new LinkedHashMap<>();
for (int well = 0; well < size; well++) {
for (String[] cell : wells.get(well)) {
for (int sIndex : sIndices) {
String currentSequence = cell[sIndex];
//skip dropout sequences, which have value -1
if (!"-1".equals(cell[sIndex])) {
if (!"-1".equals(currentSequence)) {
for (int j = 0; j < readDepth; j++) {
//Misread sequence
//The sequence is misread
if (rand.nextDouble() < readErrorRate) {
StringBuilder spurious = new StringBuilder(cell[sIndex]);
//if this sequence hasn't been misread before, or the read error is unique,
//append one more "*" than has been appended before
if (rand.nextDouble() > errorCollisionRate || !distinctMisreadCounts.containsKey(cell[sIndex])) {
distinctMisreadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
for (int k = 0; k < distinctMisreadCounts.get(cell[sIndex]); k++) {
//The sequence hasn't been read or misread before
if (!sequencesAndMisreads.containsKey(currentSequence)) {
sequencesAndMisreads.put(currentSequence, new ArrayList<>());
}
//The specific misread hasn't happened before
if (rand.nextDouble() >= errorCollisionRate || sequencesAndMisreads.get(currentSequence).size() == 0) {
//The misread doesn't collide with a real sequence already on the plate
if(rand.nextDouble() >= realSequenceCollisionRate || !sequenceMap.isEmpty()){
StringBuilder spurious = new StringBuilder(currentSequence);
for (int k = 0; k <= sequencesAndMisreads.get(currentSequence).size(); k++) {
spurious.append("*");
}
//New sequence record for the spurious sequence
SequenceRecord tmp = new SequenceRecord(spurious.toString(), sequenceTypes[sIndex]);
tmp.addRead(well);
sequenceMap.put(spurious.toString(), tmp);
//add spurious sequence to list of misreads for the real sequence
sequencesAndMisreads.get(currentSequence).add(spurious.toString());
}
//if this is a read error collision, randomly choose a number of "*"s that has been appended before
//The misread collides with a real sequence already read from plate
else {
int starCount = rand.nextInt(distinctMisreadCounts.get(cell[sIndex]));
for (int k = 0; k < starCount; k++) {
spurious.append("*");
}
sequenceMap.get(spurious.toString()).addRead(well);
String wrongSequence;
do{
//get a random real sequence that's been read from the plate before
int index = rand.nextInt(sequencesAndMisreads.size());
wrongSequence = sequencesAndMisreads.keySet().toArray(new String[0])[index];
//make sure it's not accidentally the *right* sequence
//Also that it's not a wrong sequence already in the misread list
} while(cell[sIndex].equals(wrongSequence) || sequencesAndMisreads.get(currentSequence).contains(wrongSequence));
sequenceMap.get(wrongSequence).addRead(well);
}
}
//sequence is read correctly
}
//The sequence is read correctly
else {
if (!sequenceMap.containsKey(cell[sIndex])) {
SequenceRecord tmp = new SequenceRecord(cell[sIndex], sequenceTypes[sIndex]);
//the sequence hasn't been read before
if (!sequenceMap.containsKey(currentSequence)) {
//create new record for the sequence
SequenceRecord tmp = new SequenceRecord(currentSequence, sequenceTypes[sIndex]);
//add this read to the sequence record
tmp.addRead(well);
sequenceMap.put(cell[sIndex], tmp);
} else {
sequenceMap.get(cell[sIndex]).addRead(well);
//add the sequence and its record to the sequence map
sequenceMap.put(currentSequence, tmp);
//add the sequence to the sequences and misreads map
sequencesAndMisreads.put(currentSequence, new ArrayList<>());
}
//the sequence has been read before
else {
//get the sequence's record and add this read to it
sequenceMap.get(currentSequence).addRead(well);
}
}
}