From 0b36e2b74200cd4e42efb32635f48d959ebcb4d1 Mon Sep 17 00:00:00 2001 From: eugenefischer <66030419+eugenefischer@users.noreply.github.com> Date: Wed, 28 Sep 2022 17:44:26 -0500 Subject: [PATCH] Rewrite countSequences to allow for collision with real sequences on misreads --- src/main/java/Plate.java | 90 +++++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 28 deletions(-) diff --git a/src/main/java/Plate.java b/src/main/java/Plate.java index 572ea57..3ba7a1b 100644 --- a/src/main/java/Plate.java +++ b/src/main/java/Plate.java @@ -2,6 +2,13 @@ /* TODO: Implement exponential distribution using inversion method - DONE +TODO: Implement collisions with real sequences by having the counting function keep a map of all sequences it's read, +with values of all misreads. Can then have a spurious/real collision rate, which will have count randomly select a sequence +it's already read at least once, and put that into the list of spurious sequences for the given real sequence. Will let me get rid +of the distinctMisreadCount map, and use this new map instead. Doing it this way, once a sequence has been misread as another +sequence once, it is more likely to be misread that way again, as future read error collisions can also be real sequence collisions +Prob A: a read error occurs. Prob B: it's a new error (otherwise it's a repeated error). Prob C: if new error, prob that it's +a real sequence collision (otherwise it's a new spurious sequence) - DONE TODO: Implement discrete frequency distributions using Vose's Alias Method */ @@ -148,49 +155,76 @@ public class Plate { return wells; } - //For the sequences at cell indices sIndices, counts number of unique sequences in all wells into the given map + //For the sequences at cell indices sIndices, counts number of unique sequences in all wells. + //Also simulates sequence read errors with given probabilities. + //Returns a map of SequenceRecords containing plate data for all sequences read. + //TODO actually implement usage of misreadSequences - DONE public Map countSequences(Integer readDepth, Double readErrorRate, - Double errorCollisionRate, int... sIndices) { + Double errorCollisionRate, Double realSequenceCollisionRate, int... sIndices) { SequenceType[] sequenceTypes = EnumSet.allOf(SequenceType.class).toArray(new SequenceType[0]); - Map distinctMisreadCounts = new HashMap<>(); + //Map of all sequences read. Keys are sequences, values are ways sequence has been misread. + Map> sequencesAndMisreads = new HashMap<>(); Map sequenceMap = new LinkedHashMap<>(); for (int well = 0; well < size; well++) { for (String[] cell : wells.get(well)) { for (int sIndex : sIndices) { + String currentSequence = cell[sIndex]; //skip dropout sequences, which have value -1 - if (!"-1".equals(cell[sIndex])) { + if (!"-1".equals(currentSequence)) { for (int j = 0; j < readDepth; j++) { - //Misread sequence + //The sequence is misread if (rand.nextDouble() < readErrorRate) { - StringBuilder spurious = new StringBuilder(cell[sIndex]); - //if this sequence hasn't been misread before, or the read error is unique, - //append one more "*" than has been appended before - if (rand.nextDouble() > errorCollisionRate || !distinctMisreadCounts.containsKey(cell[sIndex])) { - distinctMisreadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue); - for (int k = 0; k < distinctMisreadCounts.get(cell[sIndex]); k++) { - spurious.append("*"); - } - SequenceRecord tmp = new SequenceRecord(spurious.toString(), sequenceTypes[sIndex]); - tmp.addRead(well); - sequenceMap.put(spurious.toString(), tmp); + //The sequence hasn't been read or misread before + if (!sequencesAndMisreads.containsKey(currentSequence)) { + sequencesAndMisreads.put(currentSequence, new ArrayList<>()); } - //if this is a read error collision, randomly choose a number of "*"s that has been appended before - else { - int starCount = rand.nextInt(distinctMisreadCounts.get(cell[sIndex])); - for (int k = 0; k < starCount; k++) { - spurious.append("*"); + //The specific misread hasn't happened before + if (rand.nextDouble() >= errorCollisionRate || sequencesAndMisreads.get(currentSequence).size() == 0) { + //The misread doesn't collide with a real sequence already on the plate + if(rand.nextDouble() >= realSequenceCollisionRate || !sequenceMap.isEmpty()){ + StringBuilder spurious = new StringBuilder(currentSequence); + for (int k = 0; k <= sequencesAndMisreads.get(currentSequence).size(); k++) { + spurious.append("*"); + } + //New sequence record for the spurious sequence + SequenceRecord tmp = new SequenceRecord(spurious.toString(), sequenceTypes[sIndex]); + tmp.addRead(well); + sequenceMap.put(spurious.toString(), tmp); + //add spurious sequence to list of misreads for the real sequence + sequencesAndMisreads.get(currentSequence).add(spurious.toString()); + } + //The misread collides with a real sequence already read from plate + else { + String wrongSequence; + do{ + //get a random real sequence that's been read from the plate before + int index = rand.nextInt(sequencesAndMisreads.size()); + wrongSequence = sequencesAndMisreads.keySet().toArray(new String[0])[index]; + //make sure it's not accidentally the *right* sequence + //Also that it's not a wrong sequence already in the misread list + } while(cell[sIndex].equals(wrongSequence) || sequencesAndMisreads.get(currentSequence).contains(wrongSequence)); + sequenceMap.get(wrongSequence).addRead(well); + } - sequenceMap.get(spurious.toString()).addRead(well); } } - //sequence is read correctly + //The sequence is read correctly else { - if (!sequenceMap.containsKey(cell[sIndex])) { - SequenceRecord tmp = new SequenceRecord(cell[sIndex], sequenceTypes[sIndex]); + //the sequence hasn't been read before + if (!sequenceMap.containsKey(currentSequence)) { + //create new record for the sequence + SequenceRecord tmp = new SequenceRecord(currentSequence, sequenceTypes[sIndex]); + //add this read to the sequence record tmp.addRead(well); - sequenceMap.put(cell[sIndex], tmp); - } else { - sequenceMap.get(cell[sIndex]).addRead(well); + //add the sequence and its record to the sequence map + sequenceMap.put(currentSequence, tmp); + //add the sequence to the sequences and misreads map + sequencesAndMisreads.put(currentSequence, new ArrayList<>()); + } + //the sequence has been read before + else { + //get the sequence's record and add this read to it + sequenceMap.get(currentSequence).addRead(well); } } }