Refactor to simplify graph creation code

2022-09-28 00:07:59 -05:00
parent 35a519d499
commit 88b6c79caa
5 changed files with 330 additions and 256 deletions
--- a/src/main/java/CommandLineInterface.java
+++ b/src/main/java/CommandLineInterface.java
@@ -159,7 +159,7 @@ public class CommandLineInterface {
                if (line.hasOption("coll")) {
                    errorCollisionRate = Double.parseDouble(line.getOptionValue("coll"));
                }
-                graph = Simulator.makeGraph(cells, plate, readDepth, readErrorRate, errorCollisionRate, false);
+                graph = Simulator.makeCDR3Graph(cells, plate, readDepth, readErrorRate, errorCollisionRate, false);
                if (!line.hasOption("no-binary")) { //output binary file unless told not to
                    GraphDataObjectWriter writer = new GraphDataObjectWriter(outputFilename, graph, false);
                    writer.writeDataToFile();
--- a/src/main/java/InteractiveInterface.java
+++ b/src/main/java/InteractiveInterface.java
@@ -338,7 +338,7 @@ public class InteractiveInterface {
            System.out.println("Returning to main menu.");
        }
        else{
-            GraphWithMapData data = Simulator.makeGraph(cellSample, plate, readDepth, readErrorRate, errorCollisionRate, true);
+            GraphWithMapData data = Simulator.makeCDR3Graph(cellSample, plate, readDepth, readErrorRate, errorCollisionRate, true);
            assert filename != null;
            if(BiGpairSEQ.outputBinary()) {
                GraphDataObjectWriter dataWriter = new GraphDataObjectWriter(filename, data);
--- a/src/main/java/Plate.java
+++ b/src/main/java/Plate.java
@@ -148,127 +148,196 @@ public class Plate {
        return wells;
    }
-    //returns a map of the counts of the sequence at cell index sIndex, in all wells
+//    //returns a map of the counts of the sequence at cell index sIndex, in all wells
-    public void assayWellsSequenceS(Map<String, Integer> sequences, int... sIndices){
+//    public void assayWellsSequenceS(Map<String, Integer> sequences, int... sIndices){
-        this.assayWellsSequenceS(sequences, 0, size, sIndices);
+//        this.assayWellsSequenceS(sequences, 0, size, sIndices);
-    }
+//    }
 //
 //    //returns a map of the counts of the sequence at cell index sIndex, in a specific well
 //    public void assayWellsSequenceS(Map<String, Integer> sequences, int n, int... sIndices) {
 //        this.assayWellsSequenceS(sequences, n, n+1, sIndices);
 //    }
 //
 //    //returns a map of the counts of the sequence at cell index sIndex, in a range of wells
 //    public void assayWellsSequenceS(Map<String, Integer> sequences, int start, int end, int... sIndices) {
 //        for(int sIndex: sIndices){
 //            for(int i = start; i < end; i++){
 //                countSequences(sequences, wells.get(i), sIndex);
 //            }
 //        }
 //    }
 //    //For the sequences at cell indices sIndices, counts number of unique sequences in the given well into the given map
 //    private void countSequences(Map<String, Integer> wellMap, List<String[]> well, int... sIndices) {
 //        for(String[] cell : well) {
 //            for(int sIndex: sIndices){
 //                //skip dropout sequences, which have value -1
 //                if(!"-1".equals(cell[sIndex])){
 //                    wellMap.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
 //                }
 //            }
 //        }
 //    }
-    //returns a map of the counts of the sequence at cell index sIndex, in a specific well
+    //For the sequences at cell indices sIndices, counts number of unique sequences in all well into the given map
-    public void assayWellsSequenceS(Map<String, Integer> sequences, int n, int... sIndices) {
+    public Map<String, SequenceRecord> countSequences(Integer readDepth, Double readErrorRate,
-        this.assayWellsSequenceS(sequences, n, n+1, sIndices);
+                                                       Double errorCollisionRate, int... sIndices) {
-    }
+        SequenceType[] sequenceTypes = EnumSet.allOf(SequenceType.class).toArray(new SequenceType[0]);
        Map<String, Integer> distinctMisreadCounts = new HashMap<>();
        Map<String, SequenceRecord> sequenceMap = new LinkedHashMap<>();
        //booleans for testing
        Boolean first= false;
        Boolean second = false;
        Boolean third = false;
        Boolean fourth = false;
-    //returns a map of the counts of the sequence at cell index sIndex, in a range of wells
+        for (int well = 0; well < size; well++) {
-    public void assayWellsSequenceS(Map<String, Integer> sequences, int start, int end, int... sIndices) {
+            first = true;
-        for(int sIndex: sIndices){
+            for (String[] cell : wells.get(well)) {
-            for(int i = start; i < end; i++){
+                second = true;
-                countSequences(sequences, wells.get(i), sIndex);
+                for (int sIndex : sIndices) {
-            }
+                    third = true;
-        }
+                    //skip dropout sequences, which have value -1
-    }
+                    if (!"-1".equals(cell[sIndex])) {
-    //For the sequences at cell indices sIndices, counts number of unique sequences in the given well into the given map
+                        for (int j = 0; j < readDepth; j++) {
-    private void countSequences(Map<String, Integer> wellMap, List<String[]> well, int... sIndices) {
+                            //Misread sequence
-        for(String[] cell : well) {
+                            if (rand.nextDouble() < readErrorRate) {
-            for(int sIndex: sIndices){
+                                StringBuilder spurious = new StringBuilder(cell[sIndex]);
-                //skip dropout sequences, which have value -1
+                                //if this sequence hasn't been misread before, or the read error is unique,
-                if(!"-1".equals(cell[sIndex])){
+                                //append one more "*" than has been appended before
-                    wellMap.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
+                                if (!distinctMisreadCounts.containsKey(cell[sIndex]) || rand.nextDouble() > errorCollisionRate) {
-                }
+                                    distinctMisreadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
-            }
+                                    for (int k = 0; k < distinctMisreadCounts.get(cell[sIndex]); k++) {
-        }
+                                        spurious.append("*");
-    }
+                                    }
-
+                                    SequenceRecord tmp = new SequenceRecord(spurious.toString(), sequenceTypes[sIndex]);
-    //returns a map of the counts of the sequence at cell index sIndex, in all wells
+                                    tmp.addRead(well);
-    //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map.
+                                    sequenceMap.put(spurious.toString(), tmp);
-    public void assayWellsSequenceSWithReadDepth(Map<String, Integer> misreadCounts, Map<String, Integer> occupancyMap, Map<String, Integer> readCountMap,
+                                }
-                                                 int readDepth, double readErrorProb, double errorCollisionProb, int... sIndices) {
+                                //if this is a read error collision, randomly choose a number of "*"s that has been appended before
-        this.assayWellsSequenceSWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, 0, size, sIndices);
+                                else {
-    }
+                                    int starCount = rand.nextInt(distinctMisreadCounts.get(cell[sIndex]));
-    //returns a map of the counts of the sequence at cell index sIndex, in a specific of wells
+                                    for (int k = 0; k < starCount; k++) {
-    //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map.
+                                        spurious.append("*");
-    public void assayWellsSequenceSWithReadDepth(Map<String, Integer> misreadCounts, Map<String, Integer> occupancyMap, Map<String, Integer> readCountMap,
+                                    }
-                                                 int readDepth, double readErrorProb, double errorCollisionProb,
+                                    sequenceMap.get(spurious.toString()).addRead(well);
                                                 int n, int... sIndices) {
       this.assayWellsSequenceSWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, n, n+1, sIndices);
    }
    //returns a map of the counts of the sequence at cell index sIndex, in a range of wells
    //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map.
    public void assayWellsSequenceSWithReadDepth(Map<String, Integer> misreadCounts, Map<String, Integer> occupancyMap, Map<String, Integer> readCountMap,
                                                 int readDepth, double readErrorProb, double errorCollisionProb,
                                                 int start, int end, int... sIndices) {
        for(int sIndex: sIndices){
            for(int i = start; i < end; i++){
                countSequencesWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, wells.get(i), sIndex);
            }
        }
    }
    //For the sequences at cell indices sIndices, counts number of unique sequences in the given well into the given map
    //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map.
    //NOTE: this function changes the content of the well, adding spurious cells to contain the misread sequences
    //(this is necessary because, in the simulation, the plate is read multiple times, but random misreads can only
    //be simulated once).
    //(Possibly I should refactor all of this to only require a single plate assay, to speed things up. Or at least
    //to see if it would speed things up.)
    private void countSequencesWithReadDepth(Map<String, Integer> distinctMisreadCounts, Map<String, Integer> occupancyMap, Map<String, Integer> readCountMap,
                                             int readDepth, double readErrorProb, double errorCollisionProb,
                                             List<String[]> well, int... sIndices) {
        //list of spurious cells to add to well after counting
        List<String[]> spuriousCells = new ArrayList<>();
        for(String[] cell : well) {
            //new potential spurious cell for each cell that gets read
            String[] spuriousCell = new String[SequenceType.values().length];
            //initialize spurious cell with all dropout sequences
            Arrays.fill(spuriousCell, "-1");
            //has a read error occurred?
            boolean readError = false;
            for(int sIndex: sIndices){
                //skip dropout sequences, which have value "-1"
                if(!"-1".equals(cell[sIndex])){
                    Map<String, Integer> sequencesWithReadCounts = new LinkedHashMap<>();
                    for(int i = 0; i < readDepth; i++) {
                        if (rand.nextDouble() <= readErrorProb) {
                            readError = true;
                            //Read errors are represented by appending "*"s to the end of the sequence some number of times
                            StringBuilder spurious = new StringBuilder(cell[sIndex]);
                            //if this sequence hasn't been misread before, or the read error is unique,
                            //append one more "*" than has been appended before
                            if (!distinctMisreadCounts.containsKey(cell[sIndex]) || rand.nextDouble() > errorCollisionProb) {
                                distinctMisreadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
                                for (int j = 0; j < distinctMisreadCounts.get(cell[sIndex]); j++) {
                                    spurious.append("*");
                                }
                            }
-                            //if this is a read error collision, randomly choose a number of "*"s that has been appended before
+                            //sequence is read correctly
                            else {
-                                int starCount = rand.nextInt(distinctMisreadCounts.get(cell[sIndex]));
+                                fourth = true;
-                                for (int j = 0; j < starCount; j++) {
+                                if (!sequenceMap.containsKey(cell[sIndex])) {
-                                    spurious.append("*");
+                                    SequenceRecord tmp = new SequenceRecord(cell[sIndex], sequenceTypes[sIndex]);
                                    tmp.addRead(well);
                                    sequenceMap.put(cell[sIndex], tmp);
                                } else {
                                    sequenceMap.get(cell[sIndex]).addRead(well);
                                }
                            }
                            sequencesWithReadCounts.merge(spurious.toString(), 1, (oldValue, newValue) -> oldValue + newValue);
                            //add spurious sequence to spurious cell
                            spuriousCell[sIndex] = spurious.toString();
                        }
                        else {
                            sequencesWithReadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
                        }
                    }
                    for(String seq : sequencesWithReadCounts.keySet()) {
                        occupancyMap.merge(seq, 1, (oldValue, newValue) -> oldValue + newValue);
                        readCountMap.merge(seq, sequencesWithReadCounts.get(seq), (oldValue, newValue) -> oldValue + newValue);
                    }
                }
            }
            if (readError) { //only add a new spurious cell if there was a read error
                spuriousCells.add(spuriousCell);
            }
        }
-        //add all spurious cells to the well
+        System.out.println("First: " + first.toString());
-        well.addAll(spuriousCells);
+        System.out.println("Second: " + second.toString());
        System.out.println("Third: " + third.toString());
        System.out.println("Fourth: " + fourth.toString());
        System.out.println(sequenceMap.size());
        return sequenceMap;
    }
 //    //returns a map of the counts of the sequence at cell index sIndex, in all wells
 //    //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map.
 //    public void assayWellsSequenceSWithReadDepth(Map<String, Integer> misreadCounts, Map<String, Integer> occupancyMap, Map<String, Integer> readCountMap,
 //                                                 int readDepth, double readErrorProb, double errorCollisionProb, int... sIndices) {
 //        this.assayWellsSequenceSWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, 0, size, sIndices);
 //    }
 //    //returns a map of the counts of the sequence at cell index sIndex, in a specific of wells
 //    //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map.
 //    public void assayWellsSequenceSWithReadDepth(Map<String, Integer> misreadCounts, Map<String, Integer> occupancyMap, Map<String, Integer> readCountMap,
 //                                                 int readDepth, double readErrorProb, double errorCollisionProb,
 //                                                 int n, int... sIndices) {
 //       this.assayWellsSequenceSWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, n, n+1, sIndices);
 //    }
 //
 //    //returns a map of the counts of the sequence at cell index sIndex, in a range of wells
 //    //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map.
 //    public void assayWellsSequenceSWithReadDepth(Map<String, Integer> misreadCounts, Map<String, Integer> occupancyMap, Map<String, Integer> readCountMap,
 //                                                 int readDepth, double readErrorProb, double errorCollisionProb,
 //                                                 int start, int end, int... sIndices) {
 //        for(int sIndex: sIndices){
 //            for(int i = start; i < end; i++){
 //                countSequencesWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, wells.get(i), sIndex);
 //            }
 //        }
 //    }
 //
 //    //For the sequences at cell indices sIndices, counts number of unique sequences in the given well into the given map
 //    //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map.
 //    //NOTE: this function changes the content of the well, adding spurious cells to contain the misread sequences
 //    //(this is necessary because, in the simulation, the plate is read multiple times, but random misreads can only
 //    //be simulated once).
 //    //(Possibly I should refactor all of this to only require a single plate assay, to speed things up. Or at least
 //    //to see if it would speed things up.)
 //    private void countSequencesWithReadDepth(Map<String, Integer> distinctMisreadCounts, Map<String, Integer> occupancyMap, Map<String, Integer> readCountMap,
 //                                             int readDepth, double readErrorProb, double errorCollisionProb,
 //                                             List<String[]> well, int... sIndices) {
 //        //list of spurious cells to add to well after counting
 //        List<String[]> spuriousCells = new ArrayList<>();
 //        for(String[] cell : well) {
 //            //new potential spurious cell for each cell that gets read
 //            String[] spuriousCell = new String[SequenceType.values().length];
 //            //initialize spurious cell with all dropout sequences
 //            Arrays.fill(spuriousCell, "-1");
 //            //has a read error occurred?
 //            boolean readError = false;
 //            for(int sIndex: sIndices){
 //                //skip dropout sequences, which have value "-1"
 //                if(!"-1".equals(cell[sIndex])){
 //                    Map<String, Integer> sequencesWithReadCounts = new LinkedHashMap<>();
 //                    for(int i = 0; i < readDepth; i++) {
 //                        if (rand.nextDouble() <= readErrorProb) {
 //                            readError = true;
 //                            //Read errors are represented by appending "*"s to the end of the sequence some number of times
 //                            StringBuilder spurious = new StringBuilder(cell[sIndex]);
 //                            //if this sequence hasn't been misread before, or the read error is unique,
 //                            //append one more "*" than has been appended before
 //                            if (!distinctMisreadCounts.containsKey(cell[sIndex]) || rand.nextDouble() > errorCollisionProb) {
 //                                distinctMisreadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
 //                                for (int j = 0; j < distinctMisreadCounts.get(cell[sIndex]); j++) {
 //                                    spurious.append("*");
 //                                }
 //                            }
 //                            //if this is a read error collision, randomly choose a number of "*"s that has been appended before
 //                            else {
 //                                int starCount = rand.nextInt(distinctMisreadCounts.get(cell[sIndex]));
 //                                for (int j = 0; j < starCount; j++) {
 //                                    spurious.append("*");
 //                                }
 //                            }
 //                            sequencesWithReadCounts.merge(spurious.toString(), 1, (oldValue, newValue) -> oldValue + newValue);
 //                            //add spurious sequence to spurious cell
 //                            spuriousCell[sIndex] = spurious.toString();
 //                        }
 //                        else {
 //                            sequencesWithReadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
 //                        }
 //                    }
 //                    for(String seq : sequencesWithReadCounts.keySet()) {
 //                        occupancyMap.merge(seq, 1, (oldValue, newValue) -> oldValue + newValue);
 //                        readCountMap.merge(seq, sequencesWithReadCounts.get(seq), (oldValue, newValue) -> oldValue + newValue);
 //                    }
 //                }
 //            }
 //            if (readError) { //only add a new spurious cell if there was a read error
 //                spuriousCells.add(spuriousCell);
 //            }
 //        }
 //        //add all spurious cells to the well
 //        well.addAll(spuriousCells);
 //    }
    public String getSourceFileName() {
        return sourceFile;
    }
--- a/src/main/java/SequenceRecord.java
+++ b/src/main/java/SequenceRecord.java
@@ -0,0 +1,62 @@
 /*
 Class to represent individual sequences, holding their well occupancy and read count information.
 Will make a map of these keyed to the sequences themselves.
 Ideally, I'll be able to construct both the Vertices and the weights matrix from this map.
 */
 import java.util.*;
 public class SequenceRecord {
    private final String sequence;
    private final SequenceType type;
    //keys are well numbers, values are read count in that well
    private final Map<Integer, Integer> wells;
    public SequenceRecord (String sequence, SequenceType type) {
        this.sequence = sequence;
        this.type = type;
        this.wells = new LinkedHashMap<>();
    }
    //this shouldn't be necessary, since the sequence will be the map key, but
    public String getSequence() {
        return sequence;
    }
    public SequenceType getSequenceType(){
        return type;
    }
    //use this to update the record for each new read
    public void addRead(Integer wellNumber) {
        wells.merge(wellNumber,1, Integer::sum);
    }
    //don't know if I'll ever need this
    public void addWellData(Integer wellNumber, Integer readCount) {
        wells.put(wellNumber, readCount);
    }
    public Set<Integer> getWells() {
        return wells.keySet();
    }
    public boolean isInWell(Integer wellNumber) {
        return wells.containsKey(wellNumber);
    }
    public Integer getOccupancy() {
        return wells.size();
    }
    //read count for whole plate
    public Integer getReadCount(){
        return wells.values().stream().mapToInt(Integer::valueOf).sum();
    }
    //read count in a specific well
    public Integer getReadCount(Integer wellNumber) {
        return wells.get(wellNumber);
    }
 }
--- a/src/main/java/Simulator.java
+++ b/src/main/java/Simulator.java
@@ -12,78 +12,68 @@ import java.text.NumberFormat;
 import java.time.Instant;
 import java.time.Duration;
 import java.util.*;
 /*
 Refactor notes
 What would be necessary to do everything with only one scan through the sample plate?
 I would need to keep a list of sequences (real and spurious), and metadata about each sequence.
 I would need the data:
 * # of each well the sequence appears in
 * Read count in that well
 */
 //NOTE: "sequence" in method and variable names refers to a peptide sequence from a simulated T cell
 public class Simulator implements GraphModificationFunctions {
-    //Make the graph needed for matching sequences.
+    public static GraphWithMapData makeCDR3Graph(CellSample cellSample, Plate samplePlate, int readDepth,
-    //sourceVertexIndices and targetVertexIndices are indices within the cell to use as for the two sets of vertices
+                                                 double readErrorRate, double errorCollisionRate, boolean verbose) {
-    //in the bipartite graph. "Source" and "target" are JGraphT terms for the two vertices an edge touches,
+        //start timing
    //even if not directed.
    public static GraphWithMapData makeGraph(CellSample cellSample, Plate samplePlate, int readDepth, double readErrorRate, double errorCollisionRate, boolean verbose) {
        Instant start = Instant.now();
        Map<String, Integer> allAlphas;
        Map<String, Integer> allBetas;
        Map<String, Integer> alphaReadCounts = null;
        Map<String, Integer> betaReadCounts = null;
        Map<String, Integer> misreadCounts;
        List<String[]> distinctCells = cellSample.getCells();
        int[] alphaIndices = {SequenceType.CDR3_ALPHA.ordinal()};
        int[] betaIndices = {SequenceType.CDR3_BETA.ordinal()};
-
+        List<String[]> distinctCells = cellSample.getCells();
        int numWells = samplePlate.getSize();
        //Make a hashmap keyed to alphas, values are associated betas.
        if(verbose){System.out.println("Making cell maps");}
-        //HashMap keyed to Alphas, values Betas
+        Map<String, String> distCellsMapAlphaKey = makeSequenceToSequenceMap(distinctCells,
-        Map<String, String> distCellsMapAlphaKey = makeSequenceToSequenceMap(distinctCells, 0, 1);
+                SequenceType.CDR3_ALPHA.ordinal(), SequenceType.CDR3_BETA.ordinal());
        if(verbose){System.out.println("Cell maps made");}
-        if(verbose){System.out.println("Making well maps");}
+        //Make linkedHashMap keyed to sequences, values are SequenceRecords reflecting plate statistics
-        if(readDepth == 1) {
+        if(verbose){System.out.println("Making sample plate sequence maps");}
-            allAlphas = new HashMap<>();
+        Map<String, SequenceRecord> alphaSequences = samplePlate.countSequences(readDepth, readErrorRate,
-            samplePlate.assayWellsSequenceS(allAlphas, alphaIndices);
+                errorCollisionRate, alphaIndices);
-            allBetas = new HashMap<>();
+        int alphaCount = alphaSequences.size();
-            samplePlate.assayWellsSequenceS(allBetas, betaIndices);
+        if(verbose){System.out.println("Alphas sequences read: " + alphaCount);}
-        }
+        Map<String, SequenceRecord> betaSequences = samplePlate.countSequences(readDepth, readErrorRate,
-        else {
+                errorCollisionRate, betaIndices);
-            misreadCounts = new HashMap<>();
+        int betaCount = betaSequences.size();
-            allAlphas = new HashMap<>();
+        if(verbose){System.out.println("Betas sequences read: " + betaCount);}
-            alphaReadCounts = new HashMap<>();
+        if(verbose){System.out.println("Sample plate sequence maps made");}
            samplePlate.assayWellsSequenceSWithReadDepth(misreadCounts, allAlphas, alphaReadCounts, readDepth, readErrorRate, errorCollisionRate, alphaIndices);
            allBetas = new HashMap<>();
            betaReadCounts = new HashMap<>();
            samplePlate.assayWellsSequenceSWithReadDepth(misreadCounts, allBetas, betaReadCounts, readDepth, readErrorRate, errorCollisionRate, betaIndices);
        }
-        int alphaCount = allAlphas.size();
+        //pre-filter saturating sequences and sequences likely to be misreads
-        if(verbose){System.out.println("All alphas count: " + alphaCount);}
+        if(verbose){System.out.println("Removing sequences present in all wells.");}
-        int betaCount = allBetas.size();
+        filterByOccupancyThresholds(alphaSequences, 1, numWells - 1);
-        if(verbose){System.out.println("All betas count: " + betaCount);}
+        filterByOccupancyThresholds(betaSequences, 1, numWells - 1);
-        if(verbose){System.out.println("Well maps made");}
+        if(verbose){System.out.println("Sequences removed");}
-
+        if(verbose){System.out.println("Remaining alpha sequence count: " + alphaSequences.size());}
-        //ideally we wouldn't do any graph pre-filtering. But sequences present in all wells add a huge number of edges to the graph and don't carry any signal value
+        if(verbose){System.out.println("Remaining beta sequence count: " + betaSequences.size());}
-        if (readDepth == 1) {
+        if (readDepth > 1) {
            if(verbose){System.out.println("Removing sequences present in all wells.");}
            filterByOccupancyThresholds(allAlphas, 1, numWells - 1);
            filterByOccupancyThresholds(allBetas, 1, numWells - 1);
            if(verbose){System.out.println("Sequences removed");}
        }
        else {
            if(verbose){System.out.println("Removing sequences present in all wells.");}
            filterByOccupancyThresholds(allAlphas, 1, numWells - 1);
            filterByOccupancyThresholds(allBetas, 1, numWells - 1);
            if(verbose){System.out.println("Sequences removed");}
            if(verbose){System.out.println("Removing sequences with disparate occupancies and read counts");}
-            filterByOccupancyAndReadCount(allAlphas, alphaReadCounts, readDepth);
+            filterByOccupancyAndReadCount(alphaSequences, readDepth);
-            filterByOccupancyAndReadCount(allBetas, betaReadCounts, readDepth);
+            filterByOccupancyAndReadCount(betaSequences, readDepth);
            if(verbose){System.out.println("Sequences removed");}
            if(verbose){System.out.println("Remaining alpha sequence count: " + alphaSequences.size());}
            if(verbose){System.out.println("Remaining beta sequence count: " + betaSequences.size());}
        }
-        int pairableAlphaCount = allAlphas.size();
+        int pairableAlphaCount = alphaSequences.size();
-        if(verbose){System.out.println("Remaining alphas count: " + pairableAlphaCount);}
+        if(verbose){System.out.println("Remaining alpha sequence count: " + pairableAlphaCount);}
-        int pairableBetaCount = allBetas.size();
+        int pairableBetaCount = betaSequences.size();
-        if(verbose){System.out.println("Remaining betas count: " + pairableBetaCount);}
+        if(verbose){System.out.println("Remaining beta sequence count: " + pairableBetaCount);}
        //construct the graph. For simplicity, going to make
        if(verbose){System.out.println("Making vertex maps");}
        //For the SimpleWeightedBipartiteGraphMatrixGenerator, all vertices must have
        //distinct numbers associated with them. Since I'm using a 2D array, that means
@@ -92,50 +82,31 @@ public class Simulator implements GraphModificationFunctions {
        //subtract the vertexStartValue from betas to use their vertex labels as array indices
        int vertexStartValue = 0;
        //keys are sequential integer vertices, values are alphas
-        Map<Integer, String> plateVtoAMap = makeVertexToSequenceMap(allAlphas, vertexStartValue);
+        Map<String, Integer> plateAtoVMap = makeSequenceToVertexMap(alphaSequences, vertexStartValue);
        //new start value for vertex to beta map should be one more than final vertex value in alpha map
-        vertexStartValue += plateVtoAMap.size();
+        vertexStartValue += plateAtoVMap.size();
-        //keys are sequential integers vertices, values are betas
+        //keys are betas, values are sequential integers
-        Map<Integer, String> plateVtoBMap = makeVertexToSequenceMap(allBetas, vertexStartValue);
+        Map<String, Integer> plateBtoVMap = makeSequenceToVertexMap(betaSequences, vertexStartValue);
        //keys are alphas, values are sequential integer vertices from previous map
        Map<String, Integer> plateAtoVMap = invertVertexMap(plateVtoAMap);
        //keys are betas, values are sequential integer vertices from previous map
        Map<String, Integer> plateBtoVMap = invertVertexMap(plateVtoBMap);
        if(verbose){System.out.println("Vertex maps made");}
        //make adjacency matrix for bipartite graph generator
        //(technically this is only 1/4 of an adjacency matrix, but that's all you need
        //for a bipartite graph, and all the SimpleWeightedBipartiteGraphMatrixGenerator class expects.)
-        if(verbose){System.out.println("Creating adjacency matrix");}
+        if(verbose){System.out.println("Making adjacency matrix");}
-        //Count how many wells each alpha sequence appears in
+        double[][] weights = new double[plateAtoVMap.size()][plateBtoVMap.size()];
-        Map<String, Integer> alphaWellCounts = new HashMap<>();
+        fillAdjacencyMatrix(weights, vertexStartValue, alphaSequences, betaSequences, plateAtoVMap, plateBtoVMap);
-        //count how many wells each beta sequence appears in
+        if(verbose){System.out.println("Adjacency matrix made");}
-        Map<String, Integer> betaWellCounts = new HashMap<>();
+        //make bipartite graph
-        //the adjacency matrix to be used by the graph generator
+        if(verbose){System.out.println("Making bipartite weighted graph");}
        double[][] weights = new double[plateVtoAMap.size()][plateVtoBMap.size()];
        countSequencesAndFillMatrix(samplePlate, allAlphas, allBetas, plateAtoVMap,
                plateBtoVMap, alphaIndices, betaIndices, alphaWellCounts, betaWellCounts, weights);
        if(verbose){System.out.println("Matrix created");}
        //create bipartite graph
        if(verbose){System.out.println("Creating graph");}
        //the graph object
        SimpleWeightedGraph<Vertex, DefaultWeightedEdge> graph =
                new SimpleWeightedGraph<>(DefaultWeightedEdge.class);
        //the graph generator
        SimpleWeightedBipartiteGraphMatrixGenerator graphGenerator = new SimpleWeightedBipartiteGraphMatrixGenerator();
        //the list of alpha vertices
        //List<Integer> alphaVertices = new ArrayList<>(plateVtoAMap.keySet()); //This will work because LinkedHashMap preserves order of entry
        List<Vertex> alphaVertices = new ArrayList<>();
        //start with map of all alphas mapped to vertex values, get occupancy from the alphaWellCounts map
        for (String seq : plateAtoVMap.keySet()) {
-            Vertex alphaVertex;
+            Vertex alphaVertex = new Vertex(SequenceType.CDR3_ALPHA, seq, alphaSequences.get(seq).getOccupancy(),
-            if (readDepth == 1) {
+                    plateAtoVMap.get(seq), alphaSequences.get(seq).getReadCount());
                alphaVertex = new Vertex(SequenceType.CDR3_ALPHA, seq, alphaWellCounts.get(seq), plateAtoVMap.get(seq));
            }
            else {
                alphaVertex = new Vertex(SequenceType.CDR3_ALPHA, seq, alphaWellCounts.get(seq), plateAtoVMap.get(seq), alphaReadCounts.get(seq));
            }
            alphaVertices.add(alphaVertex);
        }
        //Sort to make sure the order of vertices in list matches the order of the adjacency matrix
@@ -143,16 +114,10 @@ public class Simulator implements GraphModificationFunctions {
        //Add ordered list of vertices to the graph
        graphGenerator.first(alphaVertices);
        //the list of beta vertices
        //List<Integer> betaVertices = new ArrayList<>(plateVtoBMap.keySet());//This will work because LinkedHashMap preserves order of entry
        List<Vertex> betaVertices = new ArrayList<>();
        for (String seq : plateBtoVMap.keySet()) {
-            Vertex betaVertex;
+            Vertex betaVertex = new Vertex(SequenceType.CDR3_BETA, seq, betaSequences.get(seq).getOccupancy(),
-            if (readDepth == 1) {
+                    plateBtoVMap.get(seq), betaSequences.get(seq).getReadCount());
                betaVertex = new Vertex(SequenceType.CDR3_BETA, seq, betaWellCounts.get(seq), plateBtoVMap.get(seq));
            }
            else {
                betaVertex = new Vertex(SequenceType.CDR3_BETA, seq, betaWellCounts.get(seq), plateBtoVMap.get(seq), betaReadCounts.get(seq));
            }
            betaVertices.add(betaVertex);
        }
        //Sort to make sure the order of vertices in list matches the order of the adjacency matrix
@@ -163,10 +128,9 @@ public class Simulator implements GraphModificationFunctions {
        graphGenerator.weights(weights);
        graphGenerator.generateGraph(graph);
        if(verbose){System.out.println("Graph created");}
-
+        //stop timing
        Instant stop = Instant.now();
        Duration time = Duration.between(start, stop);
        //create GraphWithMapData object
        GraphWithMapData output = new GraphWithMapData(graph, numWells, samplePlate.getPopulations(), distCellsMapAlphaKey,
                alphaCount, betaCount, readDepth, readErrorRate, errorCollisionRate, time);
@@ -690,10 +654,10 @@ public class Simulator implements GraphModificationFunctions {
 //    }
    //Remove sequences based on occupancy
-    public static void filterByOccupancyThresholds(Map<String, Integer> wellMap, int low, int high){
+    public static void filterByOccupancyThresholds(Map<String, SequenceRecord> wellMap, int low, int high){
        List<String> noise = new ArrayList<>();
        for(String k: wellMap.keySet()){
-            if((wellMap.get(k) > high) || (wellMap.get(k) < low)){
+            if((wellMap.get(k).getOccupancy() > high) || (wellMap.get(k).getOccupancy() < low)){
                noise.add(k);
            }
        }
@@ -702,13 +666,12 @@ public class Simulator implements GraphModificationFunctions {
        }
    }
-    public static void filterByOccupancyAndReadCount(Map<String, Integer> sequences,
+    public static void filterByOccupancyAndReadCount(Map<String, SequenceRecord> sequences, int readDepth) {
                                                     Map<String, Integer> sequenceReadCounts, int readDepth) {
        List<String> noise = new ArrayList<>();
        for(String k : sequences.keySet()){
            //occupancy times read depth should be more than half the sequence read count if the read error rate is low
-            Integer threshold = (sequences.get(k) * readDepth) / 2;
+            Integer threshold = (sequences.get(k).getOccupancy() * readDepth) / 2;
-            if(sequenceReadCounts.get(k) < threshold) {
+            if(sequences.get(k).getReadCount() < threshold) {
                noise.add(k);
            }
        }
@@ -717,50 +680,6 @@ public class Simulator implements GraphModificationFunctions {
        }
    }
    //Counts the well occupancy of the row peptides and column peptides into given maps, and
    //fills weights in the given 2D array
    private static void countSequencesAndFillMatrix(Plate samplePlate,
                                                    Map<String, Integer> allRowSequences,
                                                    Map<String, Integer> allColumnSequences,
                                                    Map<String, Integer> rowSequenceToVertexMap,
                                                    Map<String, Integer> columnSequenceToVertexMap,
                                                    int[] rowSequenceIndices,
                                                    int[] colSequenceIndices,
                                                    Map<String, Integer> rowSequenceCounts,
                                                    Map<String, Integer> columnSequenceCounts,
                                                    double[][] weights){
        Map<String, Integer> wellNRowSequences;
        Map<String, Integer> wellNColumnSequences;
        int vertexStartValue = rowSequenceToVertexMap.size();
        int numWells = samplePlate.getSize();
        for (int n = 0; n < numWells; n++) {
            wellNRowSequences = new HashMap<>();
            samplePlate.assayWellsSequenceS(wellNRowSequences, n, rowSequenceIndices);
            for (String a : wellNRowSequences.keySet()) {
                if(allRowSequences.containsKey(a)){
                    rowSequenceCounts.merge(a, 1, (oldValue, newValue) -> oldValue + newValue);
                }
            }
            wellNColumnSequences = new HashMap<>();
            samplePlate.assayWellsSequenceS(wellNColumnSequences, n, colSequenceIndices);
            for (String b : wellNColumnSequences.keySet()) {
                if(allColumnSequences.containsKey(b)){
                    columnSequenceCounts.merge(b, 1, (oldValue, newValue) -> oldValue + newValue);
                }
            }
            for (String i : wellNRowSequences.keySet()) {
                if(allRowSequences.containsKey(i)){
                    for (String j : wellNColumnSequences.keySet()) {
                        if(allColumnSequences.containsKey(j)){
                            weights[rowSequenceToVertexMap.get(i)][columnSequenceToVertexMap.get(j) - vertexStartValue] += 1.0;
                        }
                    }
                }
            }
        }
    }
    private static Map<String, String> makeSequenceToSequenceMap(List<String[]> cells, int keySequenceIndex,
                                                                 int valueSequenceIndex){
        Map<String, String> keySequenceToValueSequenceMap = new HashMap<>();
@@ -770,9 +689,9 @@ public class Simulator implements GraphModificationFunctions {
        return keySequenceToValueSequenceMap;
    }
-    private static Map<Integer, String> makeVertexToSequenceMap(Map<String, Integer> sequences, Integer startValue) {
+    private static Map<Integer, String> makeVertexToSequenceMap(Map<String, SequenceRecord> sequences, Integer startValue) {
        Map<Integer, String> map = new LinkedHashMap<>(); //LinkedHashMap to preserve order of entry
-        Integer index = startValue; //is this necessary? I don't think I use this.
+        Integer index = startValue;
        for (String k: sequences.keySet()) {
            map.put(index, k);
            index++;
@@ -780,6 +699,30 @@ public class Simulator implements GraphModificationFunctions {
        return map;
    }
    private static Map<String, Integer> makeSequenceToVertexMap(Map<String, SequenceRecord> sequences, Integer startValue) {
        Map<String, Integer> map = new LinkedHashMap<>(); //LinkedHashMap to preserve order of entry
        Integer index = startValue;
        for (String k: sequences.keySet()) {
            map.put(k, index);
            index++;
        }
        return map;
    }
    private static void fillAdjacencyMatrix(double[][] weights, Integer vertexOffsetValue, Map<String, SequenceRecord> rowSequences,
                                            Map<String, SequenceRecord> columnSequences, Map<String, Integer> rowToVertexMap,
                                            Map<String, Integer> columnToVertexMap) {
        for (String rowSeq: rowSequences.keySet()) {
            for (Integer well: rowSequences.get(rowSeq).getWells()) {
                for (String colSeq: columnSequences.keySet()) {
                    if (columnSequences.get(colSeq).isInWell(well)) {
                        weights[rowToVertexMap.get(rowSeq)][columnToVertexMap.get(colSeq) - vertexOffsetValue] += 1.0;
                    }
                }
            }
        }
    }
    private static Map<String, Integer> invertVertexMap(Map<Integer, String> map) {
        Map<String, Integer> inverse = new HashMap<>();
        for (Integer k : map.keySet()) {