Improved efficiency by pre-filtering peptide lists before making and processing graphs

This commit is contained in:
2021-11-16 16:31:35 -06:00
parent 26304e5dff
commit 48510a5b60

View File

@@ -4,12 +4,13 @@ import org.jgrapht.alg.matching.MaximumWeightBipartiteMatching;
import org.jgrapht.generate.SimpleWeightedBipartiteGraphMatrixGenerator;
import org.jgrapht.graph.DefaultWeightedEdge;
import org.jgrapht.graph.SimpleWeightedGraph;
import org.jgrapht.util.ArrayUnenforcedSet;
import java.math.BigDecimal;
import java.math.MathContext;
import java.text.NumberFormat;
//import java.time.Duration;
//import java.time.Instant;
import java.time.Instant;
import java.time.Duration;
import java.util.*;
import java.util.stream.IntStream;
@@ -74,6 +75,8 @@ public class Simulator {
public static MatchingResult matchCDR3s(List<Integer[]> distinctCells,
Plate samplePlate, Integer lowThreshold, Integer highThreshold){
System.out.println("Cells: " + distinctCells.size());
Instant start = Instant.now();
int numWells = samplePlate.getSize();
System.out.println("Making cell maps");
//HashMap keyed to Alphas, values Betas
@@ -98,6 +101,34 @@ public class Simulator {
System.out.println("Well maps made");
//Remove saturating-occupancy peptides because they have no signal value.
//Remove below-minimum-overlap-threshold peptides because they can't possibly have an overlap with another
//peptide that's above the threshold.
System.out.println("Removing below-minimum-overlap-threshold and saturating-occupancy peptides");
List<Integer> noiseAlphas = new ArrayList<>();
List<Integer> noiseBetas = new ArrayList<>();
for(Integer k: allAlphas.keySet()){
if((allAlphas.get(k)>numWells - 1) || (allAlphas.get(k) < lowThreshold)){
noiseAlphas.add(k);
}
}
for(Integer k: allBetas.keySet()){
if((allBetas.get(k)> numWells - 1 ) || (allBetas.get(k) < lowThreshold)){
noiseBetas.add(k);
}
}
for(Integer p: noiseAlphas){
allAlphas.remove(p);
}
for(Integer p: noiseBetas){
allBetas.remove(p);
}
System.out.println("Peptides removed");
int pairableAlphaCount = allAlphas.size();
System.out.println("Remaining alpha count: " + pairableAlphaCount);
int pairableBetaCount = allBetas.size();
System.out.println("Remaining beta count: " + pairableBetaCount);
System.out.println("Making vertex maps");
//Using Integers instead of Strings to label vertices so I can do clever stuff with indices if I need to
// when I refactor to use a 2d array to make the graph
@@ -115,7 +146,7 @@ public class Simulator {
Map<Integer, Integer> plateBtoVMap = invertVertexMap(plateVtoBMap);
System.out.println("Vertex maps made");
System.out.println("Creating Graph");
System.out.println("Creating adjacency matrix");
//Count how many wells each alpha appears in
Map<Integer, Integer> alphaWellCounts = new HashMap<>();
//count how many wells each beta appears in
@@ -130,18 +161,30 @@ public class Simulator {
for (int n = 0; n < numWells; n++) {
wellNAlphas = samplePlate.assayWellsCDR3Alpha(n);
for (Integer a : wellNAlphas.keySet()) {
if(allAlphas.containsKey(a)){
alphaWellCounts.merge(a, 1, (oldValue, newValue) -> oldValue + newValue);
}
}
wellNBetas = samplePlate.assayWellsCDR3Beta(n);
for (Integer b : wellNBetas.keySet()) {
if(allBetas.containsKey(b)){
betaWellCounts.merge(b, 1, (oldValue, newValue) -> oldValue + newValue);
}
}
for (Integer i : wellNAlphas.keySet()) {
if(allAlphas.containsKey(i)){
for (Integer j : wellNBetas.keySet()) {
if(allBetas.containsKey(j)){
weights[plateAtoVMap.get(i)][plateBtoVMap.get(j) - vertexStartValue] += 1.0;
}
}
}
}
}
System.out.println("matrix created");
System.out.println("creating graph");
SimpleWeightedBipartiteGraphMatrixGenerator graphGenerator = new SimpleWeightedBipartiteGraphMatrixGenerator();
List<Integer> alphaVertices = new ArrayList<>();
alphaVertices.addAll(plateVtoAMap.keySet()); //This will work because LinkedHashMap preserves order of entry
@@ -153,11 +196,21 @@ public class Simulator {
graphGenerator.generateGraph(graph);
System.out.println("Graph created");
System.out.println("Eliminating edges with weights outside threshold values");
for(DefaultWeightedEdge e: graph.edgeSet()){
if ((graph.getEdgeWeight(e) > highThreshold) || (graph.getEdgeWeight(e) < lowThreshold)){
graph.setEdgeWeight(e, 0.0);
}
}
System.out.println("Over- and under-weight edges set to 0.0");
System.out.println("Finding maximum weighted matching");
MaximumWeightBipartiteMatching maxWeightMatching =
new MaximumWeightBipartiteMatching(graph, plateVtoAMap.keySet(), plateVtoBMap.keySet());
MatchingAlgorithm.Matching<String, DefaultWeightedEdge> graphMatching = maxWeightMatching.getMatching();
System.out.println("Matching completed");
Instant stop = Instant.now();
//Header for CSV file
List<String> header = new ArrayList<>();
@@ -182,9 +235,6 @@ public class Simulator {
Map<Integer, Integer> matchMap = new HashMap<>();
while(weightIter.hasNext()) {
e = weightIter.next();
if(graph.getEdgeWeight(e) < lowThreshold || graph.getEdgeWeight(e) > highThreshold) {
continue;
}
Integer source = graph.getEdgeSource(e);
Integer target = graph.getEdgeTarget(e);
//The match map is all matches found, not just true matches!
@@ -223,10 +273,15 @@ public class Simulator {
List<String> comments = new ArrayList<>();
comments.add("Total alphas found: " + alphaCount);
comments.add("Total betas found: " + betaCount);
comments.add("High overlap threshold: " + highThreshold);
comments.add("Low overlap threshold: " + lowThreshold);
comments.add("Pairing attempt rate: " + attemptRateTrunc);
comments.add("Correct pairings: " + trueCount);
comments.add("Incorrect pairings: " + falseCount);
comments.add("Pairing error rate: " + pairingErrorRateTrunc);
Duration time = Duration.between(start, stop);
comments.add("Simulation time: " + nf.format(time.toSeconds()) + " seconds");
System.out.println("Simulation time: " + nf.format(time.toSeconds()) + " seconds");
return new MatchingResult(comments, header, allResults, matchMap);
}
@@ -234,6 +289,8 @@ public class Simulator {
public static MatchingResult[] matchCDR1s(List<Integer[]> distinctCells,
Plate samplePlate, Integer lowThreshold,
Integer highThreshold, Map<Integer, Integer> previousMatches){
Instant start = Instant.now();
int numWells = samplePlate.getSize();
System.out.println("Making previous match maps");
Map<Integer, Integer> CDR3AtoBMap = previousMatches;
@@ -261,6 +318,34 @@ public class Simulator {
System.out.println("all CDR1s count: " + CDR1Count);
System.out.println("Well maps made");
//NEW FILTERING CODE TO TEST
System.out.println("Removing unpaired CDR3s from well maps");
List<Integer> unpairedCDR3s = new ArrayList<>();
for(Integer i: allCDR3s.keySet()){
if(!(CDR3AtoBMap.containsKey(i) || CDR3BtoAMap.containsKey(i))){
unpairedCDR3s.add(i);
}
}
for(Integer i: unpairedCDR3s){
allCDR3s.remove(i);
}
System.out.println("Unpaired CDR3s removed.");
System.out.println("Remaining CDR3 count: " + allCDR3s.size());
System.out.println("Removing below-minimum-overlap-threshold and saturating-occupancy CDR1s");
List<Integer> noiseCDR1s = new ArrayList<>();
for(Integer i: allCDR1s.keySet()){
if((allCDR1s.get(i) < lowThreshold) || (allCDR1s.get(i)) > numWells - 1){
noiseCDR1s.add(i);
}
}
for(Integer i: noiseCDR1s){
allCDR1s.remove(i);
}
System.out.println("CDR1s removed.");
System.out.println("Remaining CDR1 count: " + allCDR1s.size());
//END NEW CODE
System.out.println("Making vertex maps");
//Using Integers instead of Strings to label vertices so I can do clever stuff with indices if I need to
// when I refactor to use a 2d array to make the graph
@@ -300,13 +385,15 @@ public class Simulator {
CDR1WellCounts.merge(b, 1, (oldValue, newValue) -> oldValue + newValue);
}
for (Integer i : wellNCDR3s.keySet()) {
if(CDR3AtoBMap.containsKey(i)||CDR3BtoAMap.containsKey(i)){//only consider ones we have matches for
if(allCDR3s.containsKey(i)){//only consider CDR3s we have matches for - rest filtered out
for (Integer j : wellNCDR1s.keySet()) {
if(allCDR1s.containsKey(j)){ //only those CDR1s we didn't filter out
weights[plateCDR3toVMap.get(i)][plateCDR1toVMap.get(j) - vertexStartValue] += 1.0;
}
}
}
}
}
SimpleWeightedBipartiteGraphMatrixGenerator graphGenerator = new SimpleWeightedBipartiteGraphMatrixGenerator();
List<Integer> CDR3Vertices = new ArrayList<>();
CDR3Vertices.addAll(plateVtoCDR3Map.keySet()); //This will work because LinkedHashMap preserves order of entry
@@ -319,6 +406,14 @@ public class Simulator {
System.out.println("Graph created");
System.out.println("Number of edges: " + graph.edgeSet().size());
System.out.println("Removing edges outside of weight thresholds");
for(DefaultWeightedEdge e: graph.edgeSet()){
if ((graph.getEdgeWeight(e) > highThreshold) || (graph.getEdgeWeight(e) < lowThreshold)){
graph.setEdgeWeight(e, 0.0);
}
}
System.out.println("Over- and under-weight edges set to 0.0");
System.out.println("Finding first maximum weighted matching");
MaximumWeightBipartiteMatching firstMaxWeightMatching =
new MaximumWeightBipartiteMatching(graph, plateVtoCDR3Map.keySet(), plateVtoCDR1Map.keySet());
@@ -411,17 +506,22 @@ public class Simulator {
}
}
//results for dual map
Instant stop = Instant.now();
//results for first map
System.out.println("Results for first pass");
List<List<String>> allResults = new ArrayList<>();
Integer trueCount = 0;
Iterator iter = dualMatchesMap.keySet().iterator();
Iterator iter = firstMatchesMap.keySet().iterator();
while(iter.hasNext()){
Boolean proven = false;
List<String> tmp = new ArrayList<>();
tmp.add(iter.next().toString());
tmp.add(iter.next().toString());
tmp.add(dualMatchesMap.get(Integer.valueOf(tmp.get(0))).toString());
tmp.add(dualMatchesMap.get(Integer.valueOf(tmp.get(1))).toString());
tmp.add(firstMatchesMap.get(Integer.valueOf(tmp.get(0))).toString());
tmp.add(firstMatchesMap.get(Integer.valueOf(tmp.get(1))).toString());
if(alphaCDR3toCDR1Map.get(Integer.valueOf(tmp.get(0))).equals(Integer.valueOf(tmp.get(2)))){
if(betaCDR3toCDR1Map.get(Integer.valueOf(tmp.get(1))).equals(Integer.valueOf(tmp.get(3)))){
proven = true;
@@ -449,6 +549,10 @@ public class Simulator {
double correctRate = (double) trueCount / allResults.size();
comments.add("Correct matching rate: " + correctRate);
for(String s: comments){
System.out.println(s);
}
List<String> headers = new ArrayList<>();
headers.add("CDR3 alpha");
headers.add("CDR3 beta");
@@ -456,24 +560,20 @@ public class Simulator {
headers.add("second matched CDR1");
headers.add("Correct match?");
for(String s: comments){
System.out.println(s);
}
MatchingResult firstTest = new MatchingResult(comments, headers, allResults, dualMatchesMap);
MatchingResult dualTest = new MatchingResult(comments, headers, allResults, dualMatchesMap);
//results for first map
//results for dual map
System.out.println("Results for second pass");
allResults = new ArrayList<>();
trueCount = 0;
iter = firstMatchesMap.keySet().iterator();
iter = dualMatchesMap.keySet().iterator();
while(iter.hasNext()){
Boolean proven = false;
List<String> tmp = new ArrayList<>();
tmp.add(iter.next().toString());
tmp.add(iter.next().toString());
tmp.add(firstMatchesMap.get(Integer.valueOf(tmp.get(0))).toString());
tmp.add(firstMatchesMap.get(Integer.valueOf(tmp.get(1))).toString());
tmp.add(dualMatchesMap.get(Integer.valueOf(tmp.get(0))).toString());
tmp.add(dualMatchesMap.get(Integer.valueOf(tmp.get(1))).toString());
if(alphaCDR3toCDR1Map.get(Integer.valueOf(tmp.get(0))).equals(Integer.valueOf(tmp.get(2)))){
if(betaCDR3toCDR1Map.get(Integer.valueOf(tmp.get(1))).equals(Integer.valueOf(tmp.get(3)))){
proven = true;
@@ -494,6 +594,8 @@ public class Simulator {
comments = new ArrayList<>();
comments.add("Plate size: " + samplePlate.getSize() + " wells");
comments.add("Previous pairs found: " + previousMatches.size());
comments.add("High overlap threshold: " + highThreshold);
comments.add("Low overlap threshold: " + lowThreshold);
comments.add("CDR1 matches attempted: " + allResults.size());
attemptRate = (double) allResults.size() / previousMatches.size();
comments.add("Matching attempt rate: " + attemptRate);
@@ -501,10 +603,19 @@ public class Simulator {
correctRate = (double) trueCount / allResults.size();
comments.add("Correct matching rate: " + correctRate);
for(String s: comments){
System.out.println(s);
}
MatchingResult firstTest = new MatchingResult(comments, headers, allResults, dualMatchesMap);
NumberFormat nf = NumberFormat.getInstance(Locale.US);
Duration time = Duration.between(start, stop);
comments.add("Simulation time: " + nf.format(time.toSeconds()) + " seconds");
System.out.println("Simulation time: " + nf.format(time.toSeconds()) + " seconds");
MatchingResult dualTest = new MatchingResult(comments, headers, allResults, dualMatchesMap);
MatchingResult[] output = {firstTest, dualTest};
return output;
}