From 48510a5b60090ce58d747eaa0159390a110739a0 Mon Sep 17 00:00:00 2001 From: efischer Date: Tue, 16 Nov 2021 16:31:35 -0600 Subject: [PATCH] Improved efficiency by pre-filtering peptide lists before making and processing graphs --- src/main/java/Simulator.java | 165 +++++++++++++++++++++++++++++------ 1 file changed, 138 insertions(+), 27 deletions(-) diff --git a/src/main/java/Simulator.java b/src/main/java/Simulator.java index ac54a77..c83cc73 100644 --- a/src/main/java/Simulator.java +++ b/src/main/java/Simulator.java @@ -4,12 +4,13 @@ import org.jgrapht.alg.matching.MaximumWeightBipartiteMatching; import org.jgrapht.generate.SimpleWeightedBipartiteGraphMatrixGenerator; import org.jgrapht.graph.DefaultWeightedEdge; import org.jgrapht.graph.SimpleWeightedGraph; +import org.jgrapht.util.ArrayUnenforcedSet; import java.math.BigDecimal; import java.math.MathContext; import java.text.NumberFormat; -//import java.time.Duration; -//import java.time.Instant; +import java.time.Instant; +import java.time.Duration; import java.util.*; import java.util.stream.IntStream; @@ -74,6 +75,8 @@ public class Simulator { public static MatchingResult matchCDR3s(List distinctCells, Plate samplePlate, Integer lowThreshold, Integer highThreshold){ System.out.println("Cells: " + distinctCells.size()); + Instant start = Instant.now(); + int numWells = samplePlate.getSize(); System.out.println("Making cell maps"); //HashMap keyed to Alphas, values Betas @@ -98,6 +101,34 @@ public class Simulator { System.out.println("Well maps made"); + //Remove saturating-occupancy peptides because they have no signal value. + //Remove below-minimum-overlap-threshold peptides because they can't possibly have an overlap with another + //peptide that's above the threshold. + System.out.println("Removing below-minimum-overlap-threshold and saturating-occupancy peptides"); + List noiseAlphas = new ArrayList<>(); + List noiseBetas = new ArrayList<>(); + for(Integer k: allAlphas.keySet()){ + if((allAlphas.get(k)>numWells - 1) || (allAlphas.get(k) < lowThreshold)){ + noiseAlphas.add(k); + } + } + for(Integer k: allBetas.keySet()){ + if((allBetas.get(k)> numWells - 1 ) || (allBetas.get(k) < lowThreshold)){ + noiseBetas.add(k); + } + } + for(Integer p: noiseAlphas){ + allAlphas.remove(p); + } + for(Integer p: noiseBetas){ + allBetas.remove(p); + } + System.out.println("Peptides removed"); + int pairableAlphaCount = allAlphas.size(); + System.out.println("Remaining alpha count: " + pairableAlphaCount); + int pairableBetaCount = allBetas.size(); + System.out.println("Remaining beta count: " + pairableBetaCount); + System.out.println("Making vertex maps"); //Using Integers instead of Strings to label vertices so I can do clever stuff with indices if I need to // when I refactor to use a 2d array to make the graph @@ -115,7 +146,7 @@ public class Simulator { Map plateBtoVMap = invertVertexMap(plateVtoBMap); System.out.println("Vertex maps made"); - System.out.println("Creating Graph"); + System.out.println("Creating adjacency matrix"); //Count how many wells each alpha appears in Map alphaWellCounts = new HashMap<>(); //count how many wells each beta appears in @@ -130,18 +161,30 @@ public class Simulator { for (int n = 0; n < numWells; n++) { wellNAlphas = samplePlate.assayWellsCDR3Alpha(n); for (Integer a : wellNAlphas.keySet()) { - alphaWellCounts.merge(a, 1, (oldValue, newValue) -> oldValue + newValue); + if(allAlphas.containsKey(a)){ + alphaWellCounts.merge(a, 1, (oldValue, newValue) -> oldValue + newValue); + } } wellNBetas = samplePlate.assayWellsCDR3Beta(n); for (Integer b : wellNBetas.keySet()) { - betaWellCounts.merge(b, 1, (oldValue, newValue) -> oldValue + newValue); + if(allBetas.containsKey(b)){ + betaWellCounts.merge(b, 1, (oldValue, newValue) -> oldValue + newValue); + } } for (Integer i : wellNAlphas.keySet()) { - for (Integer j : wellNBetas.keySet()) { - weights[plateAtoVMap.get(i)][plateBtoVMap.get(j) - vertexStartValue] += 1.0; + if(allAlphas.containsKey(i)){ + for (Integer j : wellNBetas.keySet()) { + if(allBetas.containsKey(j)){ + weights[plateAtoVMap.get(i)][plateBtoVMap.get(j) - vertexStartValue] += 1.0; + } + } } } } + System.out.println("matrix created"); + + System.out.println("creating graph"); + SimpleWeightedBipartiteGraphMatrixGenerator graphGenerator = new SimpleWeightedBipartiteGraphMatrixGenerator(); List alphaVertices = new ArrayList<>(); alphaVertices.addAll(plateVtoAMap.keySet()); //This will work because LinkedHashMap preserves order of entry @@ -153,11 +196,21 @@ public class Simulator { graphGenerator.generateGraph(graph); System.out.println("Graph created"); + System.out.println("Eliminating edges with weights outside threshold values"); + for(DefaultWeightedEdge e: graph.edgeSet()){ + if ((graph.getEdgeWeight(e) > highThreshold) || (graph.getEdgeWeight(e) < lowThreshold)){ + graph.setEdgeWeight(e, 0.0); + } + } + System.out.println("Over- and under-weight edges set to 0.0"); + + System.out.println("Finding maximum weighted matching"); MaximumWeightBipartiteMatching maxWeightMatching = new MaximumWeightBipartiteMatching(graph, plateVtoAMap.keySet(), plateVtoBMap.keySet()); MatchingAlgorithm.Matching graphMatching = maxWeightMatching.getMatching(); System.out.println("Matching completed"); + Instant stop = Instant.now(); //Header for CSV file List header = new ArrayList<>(); @@ -182,9 +235,6 @@ public class Simulator { Map matchMap = new HashMap<>(); while(weightIter.hasNext()) { e = weightIter.next(); - if(graph.getEdgeWeight(e) < lowThreshold || graph.getEdgeWeight(e) > highThreshold) { - continue; - } Integer source = graph.getEdgeSource(e); Integer target = graph.getEdgeTarget(e); //The match map is all matches found, not just true matches! @@ -223,10 +273,15 @@ public class Simulator { List comments = new ArrayList<>(); comments.add("Total alphas found: " + alphaCount); comments.add("Total betas found: " + betaCount); + comments.add("High overlap threshold: " + highThreshold); + comments.add("Low overlap threshold: " + lowThreshold); comments.add("Pairing attempt rate: " + attemptRateTrunc); comments.add("Correct pairings: " + trueCount); comments.add("Incorrect pairings: " + falseCount); comments.add("Pairing error rate: " + pairingErrorRateTrunc); + Duration time = Duration.between(start, stop); + comments.add("Simulation time: " + nf.format(time.toSeconds()) + " seconds"); + System.out.println("Simulation time: " + nf.format(time.toSeconds()) + " seconds"); return new MatchingResult(comments, header, allResults, matchMap); } @@ -234,6 +289,8 @@ public class Simulator { public static MatchingResult[] matchCDR1s(List distinctCells, Plate samplePlate, Integer lowThreshold, Integer highThreshold, Map previousMatches){ + Instant start = Instant.now(); + int numWells = samplePlate.getSize(); System.out.println("Making previous match maps"); Map CDR3AtoBMap = previousMatches; @@ -261,6 +318,34 @@ public class Simulator { System.out.println("all CDR1s count: " + CDR1Count); System.out.println("Well maps made"); + //NEW FILTERING CODE TO TEST + System.out.println("Removing unpaired CDR3s from well maps"); + List unpairedCDR3s = new ArrayList<>(); + for(Integer i: allCDR3s.keySet()){ + if(!(CDR3AtoBMap.containsKey(i) || CDR3BtoAMap.containsKey(i))){ + unpairedCDR3s.add(i); + } + } + for(Integer i: unpairedCDR3s){ + allCDR3s.remove(i); + } + System.out.println("Unpaired CDR3s removed."); + System.out.println("Remaining CDR3 count: " + allCDR3s.size()); + + System.out.println("Removing below-minimum-overlap-threshold and saturating-occupancy CDR1s"); + List noiseCDR1s = new ArrayList<>(); + for(Integer i: allCDR1s.keySet()){ + if((allCDR1s.get(i) < lowThreshold) || (allCDR1s.get(i)) > numWells - 1){ + noiseCDR1s.add(i); + } + } + for(Integer i: noiseCDR1s){ + allCDR1s.remove(i); + } + System.out.println("CDR1s removed."); + System.out.println("Remaining CDR1 count: " + allCDR1s.size()); + //END NEW CODE + System.out.println("Making vertex maps"); //Using Integers instead of Strings to label vertices so I can do clever stuff with indices if I need to // when I refactor to use a 2d array to make the graph @@ -300,9 +385,11 @@ public class Simulator { CDR1WellCounts.merge(b, 1, (oldValue, newValue) -> oldValue + newValue); } for (Integer i : wellNCDR3s.keySet()) { - if(CDR3AtoBMap.containsKey(i)||CDR3BtoAMap.containsKey(i)){//only consider ones we have matches for + if(allCDR3s.containsKey(i)){//only consider CDR3s we have matches for - rest filtered out for (Integer j : wellNCDR1s.keySet()) { - weights[plateCDR3toVMap.get(i)][plateCDR1toVMap.get(j) - vertexStartValue] += 1.0; + if(allCDR1s.containsKey(j)){ //only those CDR1s we didn't filter out + weights[plateCDR3toVMap.get(i)][plateCDR1toVMap.get(j) - vertexStartValue] += 1.0; + } } } } @@ -319,6 +406,14 @@ public class Simulator { System.out.println("Graph created"); System.out.println("Number of edges: " + graph.edgeSet().size()); + System.out.println("Removing edges outside of weight thresholds"); + for(DefaultWeightedEdge e: graph.edgeSet()){ + if ((graph.getEdgeWeight(e) > highThreshold) || (graph.getEdgeWeight(e) < lowThreshold)){ + graph.setEdgeWeight(e, 0.0); + } + } + System.out.println("Over- and under-weight edges set to 0.0"); + System.out.println("Finding first maximum weighted matching"); MaximumWeightBipartiteMatching firstMaxWeightMatching = new MaximumWeightBipartiteMatching(graph, plateVtoCDR3Map.keySet(), plateVtoCDR1Map.keySet()); @@ -411,17 +506,22 @@ public class Simulator { } } - //results for dual map + Instant stop = Instant.now(); + + + //results for first map + System.out.println("Results for first pass"); List> allResults = new ArrayList<>(); Integer trueCount = 0; - Iterator iter = dualMatchesMap.keySet().iterator(); + Iterator iter = firstMatchesMap.keySet().iterator(); + while(iter.hasNext()){ Boolean proven = false; List tmp = new ArrayList<>(); tmp.add(iter.next().toString()); tmp.add(iter.next().toString()); - tmp.add(dualMatchesMap.get(Integer.valueOf(tmp.get(0))).toString()); - tmp.add(dualMatchesMap.get(Integer.valueOf(tmp.get(1))).toString()); + tmp.add(firstMatchesMap.get(Integer.valueOf(tmp.get(0))).toString()); + tmp.add(firstMatchesMap.get(Integer.valueOf(tmp.get(1))).toString()); if(alphaCDR3toCDR1Map.get(Integer.valueOf(tmp.get(0))).equals(Integer.valueOf(tmp.get(2)))){ if(betaCDR3toCDR1Map.get(Integer.valueOf(tmp.get(1))).equals(Integer.valueOf(tmp.get(3)))){ proven = true; @@ -449,6 +549,10 @@ public class Simulator { double correctRate = (double) trueCount / allResults.size(); comments.add("Correct matching rate: " + correctRate); + for(String s: comments){ + System.out.println(s); + } + List headers = new ArrayList<>(); headers.add("CDR3 alpha"); headers.add("CDR3 beta"); @@ -456,24 +560,20 @@ public class Simulator { headers.add("second matched CDR1"); headers.add("Correct match?"); - for(String s: comments){ - System.out.println(s); - } + MatchingResult firstTest = new MatchingResult(comments, headers, allResults, dualMatchesMap); - MatchingResult dualTest = new MatchingResult(comments, headers, allResults, dualMatchesMap); - - - //results for first map + //results for dual map + System.out.println("Results for second pass"); allResults = new ArrayList<>(); trueCount = 0; - iter = firstMatchesMap.keySet().iterator(); + iter = dualMatchesMap.keySet().iterator(); while(iter.hasNext()){ Boolean proven = false; List tmp = new ArrayList<>(); tmp.add(iter.next().toString()); tmp.add(iter.next().toString()); - tmp.add(firstMatchesMap.get(Integer.valueOf(tmp.get(0))).toString()); - tmp.add(firstMatchesMap.get(Integer.valueOf(tmp.get(1))).toString()); + tmp.add(dualMatchesMap.get(Integer.valueOf(tmp.get(0))).toString()); + tmp.add(dualMatchesMap.get(Integer.valueOf(tmp.get(1))).toString()); if(alphaCDR3toCDR1Map.get(Integer.valueOf(tmp.get(0))).equals(Integer.valueOf(tmp.get(2)))){ if(betaCDR3toCDR1Map.get(Integer.valueOf(tmp.get(1))).equals(Integer.valueOf(tmp.get(3)))){ proven = true; @@ -494,6 +594,8 @@ public class Simulator { comments = new ArrayList<>(); comments.add("Plate size: " + samplePlate.getSize() + " wells"); comments.add("Previous pairs found: " + previousMatches.size()); + comments.add("High overlap threshold: " + highThreshold); + comments.add("Low overlap threshold: " + lowThreshold); comments.add("CDR1 matches attempted: " + allResults.size()); attemptRate = (double) allResults.size() / previousMatches.size(); comments.add("Matching attempt rate: " + attemptRate); @@ -501,10 +603,19 @@ public class Simulator { correctRate = (double) trueCount / allResults.size(); comments.add("Correct matching rate: " + correctRate); + + for(String s: comments){ System.out.println(s); } - MatchingResult firstTest = new MatchingResult(comments, headers, allResults, dualMatchesMap); + NumberFormat nf = NumberFormat.getInstance(Locale.US); + Duration time = Duration.between(start, stop); + comments.add("Simulation time: " + nf.format(time.toSeconds()) + " seconds"); + System.out.println("Simulation time: " + nf.format(time.toSeconds()) + " seconds"); + + MatchingResult dualTest = new MatchingResult(comments, headers, allResults, dualMatchesMap); + + MatchingResult[] output = {firstTest, dualTest}; return output; }