From 88b6c79caa600d909f0f04278f59403c04e59fd1 Mon Sep 17 00:00:00 2001 From: eugenefischer <66030419+eugenefischer@users.noreply.github.com> Date: Wed, 28 Sep 2022 00:07:59 -0500 Subject: [PATCH] Refactor to simplify graph creation code --- src/main/java/CommandLineInterface.java | 2 +- src/main/java/InteractiveInterface.java | 2 +- src/main/java/Plate.java | 283 +++++++++++++++--------- src/main/java/SequenceRecord.java | 62 ++++++ src/main/java/Simulator.java | 237 ++++++++------------ 5 files changed, 330 insertions(+), 256 deletions(-) create mode 100644 src/main/java/SequenceRecord.java diff --git a/src/main/java/CommandLineInterface.java b/src/main/java/CommandLineInterface.java index e3d73ee..9b7580b 100644 --- a/src/main/java/CommandLineInterface.java +++ b/src/main/java/CommandLineInterface.java @@ -159,7 +159,7 @@ public class CommandLineInterface { if (line.hasOption("coll")) { errorCollisionRate = Double.parseDouble(line.getOptionValue("coll")); } - graph = Simulator.makeGraph(cells, plate, readDepth, readErrorRate, errorCollisionRate, false); + graph = Simulator.makeCDR3Graph(cells, plate, readDepth, readErrorRate, errorCollisionRate, false); if (!line.hasOption("no-binary")) { //output binary file unless told not to GraphDataObjectWriter writer = new GraphDataObjectWriter(outputFilename, graph, false); writer.writeDataToFile(); diff --git a/src/main/java/InteractiveInterface.java b/src/main/java/InteractiveInterface.java index 45c307a..f367a0a 100644 --- a/src/main/java/InteractiveInterface.java +++ b/src/main/java/InteractiveInterface.java @@ -338,7 +338,7 @@ public class InteractiveInterface { System.out.println("Returning to main menu."); } else{ - GraphWithMapData data = Simulator.makeGraph(cellSample, plate, readDepth, readErrorRate, errorCollisionRate, true); + GraphWithMapData data = Simulator.makeCDR3Graph(cellSample, plate, readDepth, readErrorRate, errorCollisionRate, true); assert filename != null; if(BiGpairSEQ.outputBinary()) { GraphDataObjectWriter dataWriter = new GraphDataObjectWriter(filename, data); diff --git a/src/main/java/Plate.java b/src/main/java/Plate.java index 68ebe8c..0e8704b 100644 --- a/src/main/java/Plate.java +++ b/src/main/java/Plate.java @@ -148,127 +148,196 @@ public class Plate { return wells; } - //returns a map of the counts of the sequence at cell index sIndex, in all wells - public void assayWellsSequenceS(Map sequences, int... sIndices){ - this.assayWellsSequenceS(sequences, 0, size, sIndices); - } +// //returns a map of the counts of the sequence at cell index sIndex, in all wells +// public void assayWellsSequenceS(Map sequences, int... sIndices){ +// this.assayWellsSequenceS(sequences, 0, size, sIndices); +// } +// +// //returns a map of the counts of the sequence at cell index sIndex, in a specific well +// public void assayWellsSequenceS(Map sequences, int n, int... sIndices) { +// this.assayWellsSequenceS(sequences, n, n+1, sIndices); +// } +// +// //returns a map of the counts of the sequence at cell index sIndex, in a range of wells +// public void assayWellsSequenceS(Map sequences, int start, int end, int... sIndices) { +// for(int sIndex: sIndices){ +// for(int i = start; i < end; i++){ +// countSequences(sequences, wells.get(i), sIndex); +// } +// } +// } +// //For the sequences at cell indices sIndices, counts number of unique sequences in the given well into the given map +// private void countSequences(Map wellMap, List well, int... sIndices) { +// for(String[] cell : well) { +// for(int sIndex: sIndices){ +// //skip dropout sequences, which have value -1 +// if(!"-1".equals(cell[sIndex])){ +// wellMap.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue); +// } +// } +// } +// } - //returns a map of the counts of the sequence at cell index sIndex, in a specific well - public void assayWellsSequenceS(Map sequences, int n, int... sIndices) { - this.assayWellsSequenceS(sequences, n, n+1, sIndices); - } + //For the sequences at cell indices sIndices, counts number of unique sequences in all well into the given map + public Map countSequences(Integer readDepth, Double readErrorRate, + Double errorCollisionRate, int... sIndices) { + SequenceType[] sequenceTypes = EnumSet.allOf(SequenceType.class).toArray(new SequenceType[0]); + Map distinctMisreadCounts = new HashMap<>(); + Map sequenceMap = new LinkedHashMap<>(); + //booleans for testing + Boolean first= false; + Boolean second = false; + Boolean third = false; + Boolean fourth = false; - //returns a map of the counts of the sequence at cell index sIndex, in a range of wells - public void assayWellsSequenceS(Map sequences, int start, int end, int... sIndices) { - for(int sIndex: sIndices){ - for(int i = start; i < end; i++){ - countSequences(sequences, wells.get(i), sIndex); - } - } - } - //For the sequences at cell indices sIndices, counts number of unique sequences in the given well into the given map - private void countSequences(Map wellMap, List well, int... sIndices) { - for(String[] cell : well) { - for(int sIndex: sIndices){ - //skip dropout sequences, which have value -1 - if(!"-1".equals(cell[sIndex])){ - wellMap.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue); - } - } - } - } - - //returns a map of the counts of the sequence at cell index sIndex, in all wells - //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map. - public void assayWellsSequenceSWithReadDepth(Map misreadCounts, Map occupancyMap, Map readCountMap, - int readDepth, double readErrorProb, double errorCollisionProb, int... sIndices) { - this.assayWellsSequenceSWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, 0, size, sIndices); - } - //returns a map of the counts of the sequence at cell index sIndex, in a specific of wells - //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map. - public void assayWellsSequenceSWithReadDepth(Map misreadCounts, Map occupancyMap, Map readCountMap, - int readDepth, double readErrorProb, double errorCollisionProb, - int n, int... sIndices) { - this.assayWellsSequenceSWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, n, n+1, sIndices); - } - - //returns a map of the counts of the sequence at cell index sIndex, in a range of wells - //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map. - public void assayWellsSequenceSWithReadDepth(Map misreadCounts, Map occupancyMap, Map readCountMap, - int readDepth, double readErrorProb, double errorCollisionProb, - int start, int end, int... sIndices) { - for(int sIndex: sIndices){ - for(int i = start; i < end; i++){ - countSequencesWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, wells.get(i), sIndex); - } - } - } - - //For the sequences at cell indices sIndices, counts number of unique sequences in the given well into the given map - //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map. - //NOTE: this function changes the content of the well, adding spurious cells to contain the misread sequences - //(this is necessary because, in the simulation, the plate is read multiple times, but random misreads can only - //be simulated once). - //(Possibly I should refactor all of this to only require a single plate assay, to speed things up. Or at least - //to see if it would speed things up.) - private void countSequencesWithReadDepth(Map distinctMisreadCounts, Map occupancyMap, Map readCountMap, - int readDepth, double readErrorProb, double errorCollisionProb, - List well, int... sIndices) { - //list of spurious cells to add to well after counting - List spuriousCells = new ArrayList<>(); - for(String[] cell : well) { - //new potential spurious cell for each cell that gets read - String[] spuriousCell = new String[SequenceType.values().length]; - //initialize spurious cell with all dropout sequences - Arrays.fill(spuriousCell, "-1"); - //has a read error occurred? - boolean readError = false; - for(int sIndex: sIndices){ - //skip dropout sequences, which have value "-1" - if(!"-1".equals(cell[sIndex])){ - Map sequencesWithReadCounts = new LinkedHashMap<>(); - for(int i = 0; i < readDepth; i++) { - if (rand.nextDouble() <= readErrorProb) { - readError = true; - //Read errors are represented by appending "*"s to the end of the sequence some number of times - StringBuilder spurious = new StringBuilder(cell[sIndex]); - //if this sequence hasn't been misread before, or the read error is unique, - //append one more "*" than has been appended before - if (!distinctMisreadCounts.containsKey(cell[sIndex]) || rand.nextDouble() > errorCollisionProb) { - distinctMisreadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue); - for (int j = 0; j < distinctMisreadCounts.get(cell[sIndex]); j++) { - spurious.append("*"); + for (int well = 0; well < size; well++) { + first = true; + for (String[] cell : wells.get(well)) { + second = true; + for (int sIndex : sIndices) { + third = true; + //skip dropout sequences, which have value -1 + if (!"-1".equals(cell[sIndex])) { + for (int j = 0; j < readDepth; j++) { + //Misread sequence + if (rand.nextDouble() < readErrorRate) { + StringBuilder spurious = new StringBuilder(cell[sIndex]); + //if this sequence hasn't been misread before, or the read error is unique, + //append one more "*" than has been appended before + if (!distinctMisreadCounts.containsKey(cell[sIndex]) || rand.nextDouble() > errorCollisionRate) { + distinctMisreadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue); + for (int k = 0; k < distinctMisreadCounts.get(cell[sIndex]); k++) { + spurious.append("*"); + } + SequenceRecord tmp = new SequenceRecord(spurious.toString(), sequenceTypes[sIndex]); + tmp.addRead(well); + sequenceMap.put(spurious.toString(), tmp); + } + //if this is a read error collision, randomly choose a number of "*"s that has been appended before + else { + int starCount = rand.nextInt(distinctMisreadCounts.get(cell[sIndex])); + for (int k = 0; k < starCount; k++) { + spurious.append("*"); + } + sequenceMap.get(spurious.toString()).addRead(well); } } - //if this is a read error collision, randomly choose a number of "*"s that has been appended before + //sequence is read correctly else { - int starCount = rand.nextInt(distinctMisreadCounts.get(cell[sIndex])); - for (int j = 0; j < starCount; j++) { - spurious.append("*"); + fourth = true; + if (!sequenceMap.containsKey(cell[sIndex])) { + SequenceRecord tmp = new SequenceRecord(cell[sIndex], sequenceTypes[sIndex]); + tmp.addRead(well); + sequenceMap.put(cell[sIndex], tmp); + } else { + sequenceMap.get(cell[sIndex]).addRead(well); } } - sequencesWithReadCounts.merge(spurious.toString(), 1, (oldValue, newValue) -> oldValue + newValue); - //add spurious sequence to spurious cell - spuriousCell[sIndex] = spurious.toString(); } - else { - sequencesWithReadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue); - } - } - for(String seq : sequencesWithReadCounts.keySet()) { - occupancyMap.merge(seq, 1, (oldValue, newValue) -> oldValue + newValue); - readCountMap.merge(seq, sequencesWithReadCounts.get(seq), (oldValue, newValue) -> oldValue + newValue); } } } - if (readError) { //only add a new spurious cell if there was a read error - spuriousCells.add(spuriousCell); - } } - //add all spurious cells to the well - well.addAll(spuriousCells); + System.out.println("First: " + first.toString()); + System.out.println("Second: " + second.toString()); + System.out.println("Third: " + third.toString()); + System.out.println("Fourth: " + fourth.toString()); + System.out.println(sequenceMap.size()); + return sequenceMap; } + +// //returns a map of the counts of the sequence at cell index sIndex, in all wells +// //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map. +// public void assayWellsSequenceSWithReadDepth(Map misreadCounts, Map occupancyMap, Map readCountMap, +// int readDepth, double readErrorProb, double errorCollisionProb, int... sIndices) { +// this.assayWellsSequenceSWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, 0, size, sIndices); +// } +// //returns a map of the counts of the sequence at cell index sIndex, in a specific of wells +// //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map. +// public void assayWellsSequenceSWithReadDepth(Map misreadCounts, Map occupancyMap, Map readCountMap, +// int readDepth, double readErrorProb, double errorCollisionProb, +// int n, int... sIndices) { +// this.assayWellsSequenceSWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, n, n+1, sIndices); +// } +// +// //returns a map of the counts of the sequence at cell index sIndex, in a range of wells +// //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map. +// public void assayWellsSequenceSWithReadDepth(Map misreadCounts, Map occupancyMap, Map readCountMap, +// int readDepth, double readErrorProb, double errorCollisionProb, +// int start, int end, int... sIndices) { +// for(int sIndex: sIndices){ +// for(int i = start; i < end; i++){ +// countSequencesWithReadDepth(misreadCounts, occupancyMap, readCountMap, readDepth, readErrorProb, errorCollisionProb, wells.get(i), sIndex); +// } +// } +// } +// +// //For the sequences at cell indices sIndices, counts number of unique sequences in the given well into the given map +// //Simulates read depth and read errors, counts the number of reads of a unique sequence into the given map. +// //NOTE: this function changes the content of the well, adding spurious cells to contain the misread sequences +// //(this is necessary because, in the simulation, the plate is read multiple times, but random misreads can only +// //be simulated once). +// //(Possibly I should refactor all of this to only require a single plate assay, to speed things up. Or at least +// //to see if it would speed things up.) +// private void countSequencesWithReadDepth(Map distinctMisreadCounts, Map occupancyMap, Map readCountMap, +// int readDepth, double readErrorProb, double errorCollisionProb, +// List well, int... sIndices) { +// //list of spurious cells to add to well after counting +// List spuriousCells = new ArrayList<>(); +// for(String[] cell : well) { +// //new potential spurious cell for each cell that gets read +// String[] spuriousCell = new String[SequenceType.values().length]; +// //initialize spurious cell with all dropout sequences +// Arrays.fill(spuriousCell, "-1"); +// //has a read error occurred? +// boolean readError = false; +// for(int sIndex: sIndices){ +// //skip dropout sequences, which have value "-1" +// if(!"-1".equals(cell[sIndex])){ +// Map sequencesWithReadCounts = new LinkedHashMap<>(); +// for(int i = 0; i < readDepth; i++) { +// if (rand.nextDouble() <= readErrorProb) { +// readError = true; +// //Read errors are represented by appending "*"s to the end of the sequence some number of times +// StringBuilder spurious = new StringBuilder(cell[sIndex]); +// //if this sequence hasn't been misread before, or the read error is unique, +// //append one more "*" than has been appended before +// if (!distinctMisreadCounts.containsKey(cell[sIndex]) || rand.nextDouble() > errorCollisionProb) { +// distinctMisreadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue); +// for (int j = 0; j < distinctMisreadCounts.get(cell[sIndex]); j++) { +// spurious.append("*"); +// } +// } +// //if this is a read error collision, randomly choose a number of "*"s that has been appended before +// else { +// int starCount = rand.nextInt(distinctMisreadCounts.get(cell[sIndex])); +// for (int j = 0; j < starCount; j++) { +// spurious.append("*"); +// } +// } +// sequencesWithReadCounts.merge(spurious.toString(), 1, (oldValue, newValue) -> oldValue + newValue); +// //add spurious sequence to spurious cell +// spuriousCell[sIndex] = spurious.toString(); +// } +// else { +// sequencesWithReadCounts.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue); +// } +// } +// for(String seq : sequencesWithReadCounts.keySet()) { +// occupancyMap.merge(seq, 1, (oldValue, newValue) -> oldValue + newValue); +// readCountMap.merge(seq, sequencesWithReadCounts.get(seq), (oldValue, newValue) -> oldValue + newValue); +// } +// } +// } +// if (readError) { //only add a new spurious cell if there was a read error +// spuriousCells.add(spuriousCell); +// } +// } +// //add all spurious cells to the well +// well.addAll(spuriousCells); +// } + public String getSourceFileName() { return sourceFile; } diff --git a/src/main/java/SequenceRecord.java b/src/main/java/SequenceRecord.java new file mode 100644 index 0000000..1c9371f --- /dev/null +++ b/src/main/java/SequenceRecord.java @@ -0,0 +1,62 @@ +/* +Class to represent individual sequences, holding their well occupancy and read count information. +Will make a map of these keyed to the sequences themselves. +Ideally, I'll be able to construct both the Vertices and the weights matrix from this map. + + */ + +import java.util.*; + +public class SequenceRecord { + private final String sequence; + private final SequenceType type; + //keys are well numbers, values are read count in that well + private final Map wells; + + public SequenceRecord (String sequence, SequenceType type) { + this.sequence = sequence; + this.type = type; + this.wells = new LinkedHashMap<>(); + } + + //this shouldn't be necessary, since the sequence will be the map key, but + public String getSequence() { + return sequence; + } + + public SequenceType getSequenceType(){ + return type; + } + + //use this to update the record for each new read + public void addRead(Integer wellNumber) { + wells.merge(wellNumber,1, Integer::sum); + } + + //don't know if I'll ever need this + public void addWellData(Integer wellNumber, Integer readCount) { + wells.put(wellNumber, readCount); + } + + public Set getWells() { + return wells.keySet(); + } + + public boolean isInWell(Integer wellNumber) { + return wells.containsKey(wellNumber); + } + + public Integer getOccupancy() { + return wells.size(); + } + + //read count for whole plate + public Integer getReadCount(){ + return wells.values().stream().mapToInt(Integer::valueOf).sum(); + } + + //read count in a specific well + public Integer getReadCount(Integer wellNumber) { + return wells.get(wellNumber); + } + } diff --git a/src/main/java/Simulator.java b/src/main/java/Simulator.java index e7a2cfc..f5b61f3 100644 --- a/src/main/java/Simulator.java +++ b/src/main/java/Simulator.java @@ -12,78 +12,68 @@ import java.text.NumberFormat; import java.time.Instant; import java.time.Duration; import java.util.*; +/* + Refactor notes + What would be necessary to do everything with only one scan through the sample plate? + I would need to keep a list of sequences (real and spurious), and metadata about each sequence. + I would need the data: + * # of each well the sequence appears in + * Read count in that well + */ + //NOTE: "sequence" in method and variable names refers to a peptide sequence from a simulated T cell public class Simulator implements GraphModificationFunctions { - //Make the graph needed for matching sequences. - //sourceVertexIndices and targetVertexIndices are indices within the cell to use as for the two sets of vertices - //in the bipartite graph. "Source" and "target" are JGraphT terms for the two vertices an edge touches, - //even if not directed. - public static GraphWithMapData makeGraph(CellSample cellSample, Plate samplePlate, int readDepth, double readErrorRate, double errorCollisionRate, boolean verbose) { + public static GraphWithMapData makeCDR3Graph(CellSample cellSample, Plate samplePlate, int readDepth, + double readErrorRate, double errorCollisionRate, boolean verbose) { + //start timing Instant start = Instant.now(); - Map allAlphas; - Map allBetas; - Map alphaReadCounts = null; - Map betaReadCounts = null; - Map misreadCounts; - List distinctCells = cellSample.getCells(); int[] alphaIndices = {SequenceType.CDR3_ALPHA.ordinal()}; int[] betaIndices = {SequenceType.CDR3_BETA.ordinal()}; - + List distinctCells = cellSample.getCells(); int numWells = samplePlate.getSize(); + //Make a hashmap keyed to alphas, values are associated betas. if(verbose){System.out.println("Making cell maps");} - //HashMap keyed to Alphas, values Betas - Map distCellsMapAlphaKey = makeSequenceToSequenceMap(distinctCells, 0, 1); + Map distCellsMapAlphaKey = makeSequenceToSequenceMap(distinctCells, + SequenceType.CDR3_ALPHA.ordinal(), SequenceType.CDR3_BETA.ordinal()); if(verbose){System.out.println("Cell maps made");} - if(verbose){System.out.println("Making well maps");} - if(readDepth == 1) { - allAlphas = new HashMap<>(); - samplePlate.assayWellsSequenceS(allAlphas, alphaIndices); - allBetas = new HashMap<>(); - samplePlate.assayWellsSequenceS(allBetas, betaIndices); - } - else { - misreadCounts = new HashMap<>(); - allAlphas = new HashMap<>(); - alphaReadCounts = new HashMap<>(); - samplePlate.assayWellsSequenceSWithReadDepth(misreadCounts, allAlphas, alphaReadCounts, readDepth, readErrorRate, errorCollisionRate, alphaIndices); - allBetas = new HashMap<>(); - betaReadCounts = new HashMap<>(); - samplePlate.assayWellsSequenceSWithReadDepth(misreadCounts, allBetas, betaReadCounts, readDepth, readErrorRate, errorCollisionRate, betaIndices); - } + //Make linkedHashMap keyed to sequences, values are SequenceRecords reflecting plate statistics + if(verbose){System.out.println("Making sample plate sequence maps");} + Map alphaSequences = samplePlate.countSequences(readDepth, readErrorRate, + errorCollisionRate, alphaIndices); + int alphaCount = alphaSequences.size(); + if(verbose){System.out.println("Alphas sequences read: " + alphaCount);} + Map betaSequences = samplePlate.countSequences(readDepth, readErrorRate, + errorCollisionRate, betaIndices); + int betaCount = betaSequences.size(); + if(verbose){System.out.println("Betas sequences read: " + betaCount);} + if(verbose){System.out.println("Sample plate sequence maps made");} - int alphaCount = allAlphas.size(); - if(verbose){System.out.println("All alphas count: " + alphaCount);} - int betaCount = allBetas.size(); - if(verbose){System.out.println("All betas count: " + betaCount);} - if(verbose){System.out.println("Well maps made");} - - //ideally we wouldn't do any graph pre-filtering. But sequences present in all wells add a huge number of edges to the graph and don't carry any signal value - if (readDepth == 1) { - if(verbose){System.out.println("Removing sequences present in all wells.");} - filterByOccupancyThresholds(allAlphas, 1, numWells - 1); - filterByOccupancyThresholds(allBetas, 1, numWells - 1); - if(verbose){System.out.println("Sequences removed");} - } - else { - if(verbose){System.out.println("Removing sequences present in all wells.");} - filterByOccupancyThresholds(allAlphas, 1, numWells - 1); - filterByOccupancyThresholds(allBetas, 1, numWells - 1); - if(verbose){System.out.println("Sequences removed");} + //pre-filter saturating sequences and sequences likely to be misreads + if(verbose){System.out.println("Removing sequences present in all wells.");} + filterByOccupancyThresholds(alphaSequences, 1, numWells - 1); + filterByOccupancyThresholds(betaSequences, 1, numWells - 1); + if(verbose){System.out.println("Sequences removed");} + if(verbose){System.out.println("Remaining alpha sequence count: " + alphaSequences.size());} + if(verbose){System.out.println("Remaining beta sequence count: " + betaSequences.size());} + if (readDepth > 1) { if(verbose){System.out.println("Removing sequences with disparate occupancies and read counts");} - filterByOccupancyAndReadCount(allAlphas, alphaReadCounts, readDepth); - filterByOccupancyAndReadCount(allBetas, betaReadCounts, readDepth); + filterByOccupancyAndReadCount(alphaSequences, readDepth); + filterByOccupancyAndReadCount(betaSequences, readDepth); if(verbose){System.out.println("Sequences removed");} + if(verbose){System.out.println("Remaining alpha sequence count: " + alphaSequences.size());} + if(verbose){System.out.println("Remaining beta sequence count: " + betaSequences.size());} } - int pairableAlphaCount = allAlphas.size(); - if(verbose){System.out.println("Remaining alphas count: " + pairableAlphaCount);} - int pairableBetaCount = allBetas.size(); - if(verbose){System.out.println("Remaining betas count: " + pairableBetaCount);} + int pairableAlphaCount = alphaSequences.size(); + if(verbose){System.out.println("Remaining alpha sequence count: " + pairableAlphaCount);} + int pairableBetaCount = betaSequences.size(); + if(verbose){System.out.println("Remaining beta sequence count: " + pairableBetaCount);} + //construct the graph. For simplicity, going to make if(verbose){System.out.println("Making vertex maps");} //For the SimpleWeightedBipartiteGraphMatrixGenerator, all vertices must have //distinct numbers associated with them. Since I'm using a 2D array, that means @@ -92,50 +82,31 @@ public class Simulator implements GraphModificationFunctions { //subtract the vertexStartValue from betas to use their vertex labels as array indices int vertexStartValue = 0; //keys are sequential integer vertices, values are alphas - Map plateVtoAMap = makeVertexToSequenceMap(allAlphas, vertexStartValue); + Map plateAtoVMap = makeSequenceToVertexMap(alphaSequences, vertexStartValue); //new start value for vertex to beta map should be one more than final vertex value in alpha map - vertexStartValue += plateVtoAMap.size(); - //keys are sequential integers vertices, values are betas - Map plateVtoBMap = makeVertexToSequenceMap(allBetas, vertexStartValue); - //keys are alphas, values are sequential integer vertices from previous map - Map plateAtoVMap = invertVertexMap(plateVtoAMap); - //keys are betas, values are sequential integer vertices from previous map - Map plateBtoVMap = invertVertexMap(plateVtoBMap); + vertexStartValue += plateAtoVMap.size(); + //keys are betas, values are sequential integers + Map plateBtoVMap = makeSequenceToVertexMap(betaSequences, vertexStartValue); if(verbose){System.out.println("Vertex maps made");} - //make adjacency matrix for bipartite graph generator //(technically this is only 1/4 of an adjacency matrix, but that's all you need //for a bipartite graph, and all the SimpleWeightedBipartiteGraphMatrixGenerator class expects.) - if(verbose){System.out.println("Creating adjacency matrix");} - //Count how many wells each alpha sequence appears in - Map alphaWellCounts = new HashMap<>(); - //count how many wells each beta sequence appears in - Map betaWellCounts = new HashMap<>(); - //the adjacency matrix to be used by the graph generator - double[][] weights = new double[plateVtoAMap.size()][plateVtoBMap.size()]; - countSequencesAndFillMatrix(samplePlate, allAlphas, allBetas, plateAtoVMap, - plateBtoVMap, alphaIndices, betaIndices, alphaWellCounts, betaWellCounts, weights); - if(verbose){System.out.println("Matrix created");} - - //create bipartite graph - if(verbose){System.out.println("Creating graph");} + if(verbose){System.out.println("Making adjacency matrix");} + double[][] weights = new double[plateAtoVMap.size()][plateBtoVMap.size()]; + fillAdjacencyMatrix(weights, vertexStartValue, alphaSequences, betaSequences, plateAtoVMap, plateBtoVMap); + if(verbose){System.out.println("Adjacency matrix made");} + //make bipartite graph + if(verbose){System.out.println("Making bipartite weighted graph");} //the graph object SimpleWeightedGraph graph = new SimpleWeightedGraph<>(DefaultWeightedEdge.class); //the graph generator SimpleWeightedBipartiteGraphMatrixGenerator graphGenerator = new SimpleWeightedBipartiteGraphMatrixGenerator(); //the list of alpha vertices - //List alphaVertices = new ArrayList<>(plateVtoAMap.keySet()); //This will work because LinkedHashMap preserves order of entry List alphaVertices = new ArrayList<>(); - //start with map of all alphas mapped to vertex values, get occupancy from the alphaWellCounts map for (String seq : plateAtoVMap.keySet()) { - Vertex alphaVertex; - if (readDepth == 1) { - alphaVertex = new Vertex(SequenceType.CDR3_ALPHA, seq, alphaWellCounts.get(seq), plateAtoVMap.get(seq)); - } - else { - alphaVertex = new Vertex(SequenceType.CDR3_ALPHA, seq, alphaWellCounts.get(seq), plateAtoVMap.get(seq), alphaReadCounts.get(seq)); - } + Vertex alphaVertex = new Vertex(SequenceType.CDR3_ALPHA, seq, alphaSequences.get(seq).getOccupancy(), + plateAtoVMap.get(seq), alphaSequences.get(seq).getReadCount()); alphaVertices.add(alphaVertex); } //Sort to make sure the order of vertices in list matches the order of the adjacency matrix @@ -143,16 +114,10 @@ public class Simulator implements GraphModificationFunctions { //Add ordered list of vertices to the graph graphGenerator.first(alphaVertices); //the list of beta vertices - //List betaVertices = new ArrayList<>(plateVtoBMap.keySet());//This will work because LinkedHashMap preserves order of entry List betaVertices = new ArrayList<>(); for (String seq : plateBtoVMap.keySet()) { - Vertex betaVertex; - if (readDepth == 1) { - betaVertex = new Vertex(SequenceType.CDR3_BETA, seq, betaWellCounts.get(seq), plateBtoVMap.get(seq)); - } - else { - betaVertex = new Vertex(SequenceType.CDR3_BETA, seq, betaWellCounts.get(seq), plateBtoVMap.get(seq), betaReadCounts.get(seq)); - } + Vertex betaVertex = new Vertex(SequenceType.CDR3_BETA, seq, betaSequences.get(seq).getOccupancy(), + plateBtoVMap.get(seq), betaSequences.get(seq).getReadCount()); betaVertices.add(betaVertex); } //Sort to make sure the order of vertices in list matches the order of the adjacency matrix @@ -163,10 +128,9 @@ public class Simulator implements GraphModificationFunctions { graphGenerator.weights(weights); graphGenerator.generateGraph(graph); if(verbose){System.out.println("Graph created");} - + //stop timing Instant stop = Instant.now(); Duration time = Duration.between(start, stop); - //create GraphWithMapData object GraphWithMapData output = new GraphWithMapData(graph, numWells, samplePlate.getPopulations(), distCellsMapAlphaKey, alphaCount, betaCount, readDepth, readErrorRate, errorCollisionRate, time); @@ -690,10 +654,10 @@ public class Simulator implements GraphModificationFunctions { // } //Remove sequences based on occupancy - public static void filterByOccupancyThresholds(Map wellMap, int low, int high){ + public static void filterByOccupancyThresholds(Map wellMap, int low, int high){ List noise = new ArrayList<>(); for(String k: wellMap.keySet()){ - if((wellMap.get(k) > high) || (wellMap.get(k) < low)){ + if((wellMap.get(k).getOccupancy() > high) || (wellMap.get(k).getOccupancy() < low)){ noise.add(k); } } @@ -702,13 +666,12 @@ public class Simulator implements GraphModificationFunctions { } } - public static void filterByOccupancyAndReadCount(Map sequences, - Map sequenceReadCounts, int readDepth) { + public static void filterByOccupancyAndReadCount(Map sequences, int readDepth) { List noise = new ArrayList<>(); for(String k : sequences.keySet()){ //occupancy times read depth should be more than half the sequence read count if the read error rate is low - Integer threshold = (sequences.get(k) * readDepth) / 2; - if(sequenceReadCounts.get(k) < threshold) { + Integer threshold = (sequences.get(k).getOccupancy() * readDepth) / 2; + if(sequences.get(k).getReadCount() < threshold) { noise.add(k); } } @@ -717,50 +680,6 @@ public class Simulator implements GraphModificationFunctions { } } - //Counts the well occupancy of the row peptides and column peptides into given maps, and - //fills weights in the given 2D array - private static void countSequencesAndFillMatrix(Plate samplePlate, - Map allRowSequences, - Map allColumnSequences, - Map rowSequenceToVertexMap, - Map columnSequenceToVertexMap, - int[] rowSequenceIndices, - int[] colSequenceIndices, - Map rowSequenceCounts, - Map columnSequenceCounts, - double[][] weights){ - Map wellNRowSequences; - Map wellNColumnSequences; - int vertexStartValue = rowSequenceToVertexMap.size(); - int numWells = samplePlate.getSize(); - for (int n = 0; n < numWells; n++) { - wellNRowSequences = new HashMap<>(); - samplePlate.assayWellsSequenceS(wellNRowSequences, n, rowSequenceIndices); - for (String a : wellNRowSequences.keySet()) { - if(allRowSequences.containsKey(a)){ - rowSequenceCounts.merge(a, 1, (oldValue, newValue) -> oldValue + newValue); - } - } - wellNColumnSequences = new HashMap<>(); - samplePlate.assayWellsSequenceS(wellNColumnSequences, n, colSequenceIndices); - for (String b : wellNColumnSequences.keySet()) { - if(allColumnSequences.containsKey(b)){ - columnSequenceCounts.merge(b, 1, (oldValue, newValue) -> oldValue + newValue); - } - } - for (String i : wellNRowSequences.keySet()) { - if(allRowSequences.containsKey(i)){ - for (String j : wellNColumnSequences.keySet()) { - if(allColumnSequences.containsKey(j)){ - weights[rowSequenceToVertexMap.get(i)][columnSequenceToVertexMap.get(j) - vertexStartValue] += 1.0; - } - } - } - } - - } - } - private static Map makeSequenceToSequenceMap(List cells, int keySequenceIndex, int valueSequenceIndex){ Map keySequenceToValueSequenceMap = new HashMap<>(); @@ -770,9 +689,9 @@ public class Simulator implements GraphModificationFunctions { return keySequenceToValueSequenceMap; } - private static Map makeVertexToSequenceMap(Map sequences, Integer startValue) { + private static Map makeVertexToSequenceMap(Map sequences, Integer startValue) { Map map = new LinkedHashMap<>(); //LinkedHashMap to preserve order of entry - Integer index = startValue; //is this necessary? I don't think I use this. + Integer index = startValue; for (String k: sequences.keySet()) { map.put(index, k); index++; @@ -780,6 +699,30 @@ public class Simulator implements GraphModificationFunctions { return map; } + private static Map makeSequenceToVertexMap(Map sequences, Integer startValue) { + Map map = new LinkedHashMap<>(); //LinkedHashMap to preserve order of entry + Integer index = startValue; + for (String k: sequences.keySet()) { + map.put(k, index); + index++; + } + return map; + } + + private static void fillAdjacencyMatrix(double[][] weights, Integer vertexOffsetValue, Map rowSequences, + Map columnSequences, Map rowToVertexMap, + Map columnToVertexMap) { + for (String rowSeq: rowSequences.keySet()) { + for (Integer well: rowSequences.get(rowSeq).getWells()) { + for (String colSeq: columnSequences.keySet()) { + if (columnSequences.get(colSeq).isInWell(well)) { + weights[rowToVertexMap.get(rowSeq)][columnToVertexMap.get(colSeq) - vertexOffsetValue] += 1.0; + } + } + } + } + } + private static Map invertVertexMap(Map map) { Map inverse = new HashMap<>(); for (Integer k : map.keySet()) {