Compare commits
57 Commits
bf32a55e4b
...
v1.0
| Author | SHA1 | Date | |
|---|---|---|---|
| 4b053e6ec4 | |||
| 44784b7976 | |||
| 7c19896dc9 | |||
| aec7e3016f | |||
| 5c75c1ac09 | |||
| cb1f7adece | |||
| 370de79546 | |||
| a803336f56 | |||
| 94b54b3416 | |||
| 601e141fd0 | |||
| 8f9c6b7d33 | |||
| e5ddc73723 | |||
| 9b18fac74f | |||
| 63ef6aa7a0 | |||
| 7558455f39 | |||
| 410f0ae547 | |||
| 1bc6a11545 | |||
| 2b13e10e95 | |||
| 4fd5baeb98 | |||
| b2a4e9a42b | |||
| d1bb49b482 | |||
| 9adb7dffb8 | |||
| 2023bb9d7e | |||
| 405fbf17ff | |||
| 24519f4a52 | |||
| 2afd01eeef | |||
| 10d0b711bf | |||
| 8f98baf44e | |||
| d6c7c40c96 | |||
| 61c14b2ecf | |||
| 22fc4aedfe | |||
| 5d24dc6f70 | |||
| 2c01a0211c | |||
| f2b5d9e1b7 | |||
| 74c8cafd81 | |||
| d1c37b5ccd | |||
| cb2c5a6024 | |||
| 284a5b3a40 | |||
| 52afb1edc2 | |||
| 9c52bc878a | |||
| 248fe4d662 | |||
| 5d0e60708c | |||
| c96b7237e9 | |||
| 0b28259800 | |||
| 837ef7bfe4 | |||
| 0bebbc7602 | |||
| 84f7ddb696 | |||
| c4633da9eb | |||
| 5b2ed165d0 | |||
| 0026d8cdfe | |||
| 13fb7168bf | |||
| 568a6be3c7 | |||
| cfa473c7ce | |||
| 6faacd9a82 | |||
| ce88e170c1 | |||
| 47e23addfa | |||
| b9ee31b64c |
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/out/
|
||||
@@ -1,11 +1,11 @@
|
||||
<component name="ArtifactManager">
|
||||
<artifact type="jar" name="TCellSim:jar">
|
||||
<output-path>$PROJECT_DIR$/out/artifacts/TCellSim_jar</output-path>
|
||||
<root id="archive" name="TCellSim.jar">
|
||||
<artifact type="jar" build-on-make="true" name="BiGpairSEQ_Sim:jar">
|
||||
<output-path>$PROJECT_DIR$/out/artifacts/BiGpairSEQ_Sim_jar</output-path>
|
||||
<root id="archive" name="BiGpairSEQ_Sim.jar">
|
||||
<element id="directory" name="META-INF">
|
||||
<element id="file-copy" path="$PROJECT_DIR$/src/main/java/META-INF/MANIFEST.MF" />
|
||||
</element>
|
||||
<element id="module-output" name="TCellSim" />
|
||||
<element id="module-output" name="BigPairSEQ" />
|
||||
<element id="extracted-dir" path="$MAVEN_REPOSITORY$/org/jgrapht/jgrapht-core/1.5.1/jgrapht-core-1.5.1.jar" path-in-jar="/" />
|
||||
<element id="extracted-dir" path="$MAVEN_REPOSITORY$/org/jheaps/jheaps/0.13/jheaps-0.13.jar" path-in-jar="/" />
|
||||
<element id="extracted-dir" path="$MAVEN_REPOSITORY$/commons-cli/commons-cli/1.5.0/commons-cli-1.5.0.jar" path-in-jar="/" />
|
||||
2
.idea/compiler.xml
generated
2
.idea/compiler.xml
generated
@@ -6,7 +6,7 @@
|
||||
<sourceOutputDir name="target/generated-sources/annotations" />
|
||||
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
|
||||
<outputRelativeToContentRoot value="true" />
|
||||
<module name="TCellSim" />
|
||||
<module name="BigPairSEQ" />
|
||||
</profile>
|
||||
</annotationProcessing>
|
||||
</component>
|
||||
|
||||
15
.idea/libraries/jgrapht_io.xml
generated
Normal file
15
.idea/libraries/jgrapht_io.xml
generated
Normal file
@@ -0,0 +1,15 @@
|
||||
<component name="libraryTable">
|
||||
<library name="jgrapht.io" type="repository">
|
||||
<properties maven-id="org.jgrapht:jgrapht-io:1.5.1" />
|
||||
<CLASSES>
|
||||
<root url="jar://$MAVEN_REPOSITORY$/org/jgrapht/jgrapht-io/1.5.1/jgrapht-io-1.5.1.jar!/" />
|
||||
<root url="jar://$MAVEN_REPOSITORY$/org/jgrapht/jgrapht-core/1.5.1/jgrapht-core-1.5.1.jar!/" />
|
||||
<root url="jar://$MAVEN_REPOSITORY$/org/jheaps/jheaps/0.13/jheaps-0.13.jar!/" />
|
||||
<root url="jar://$MAVEN_REPOSITORY$/org/antlr/antlr4-runtime/4.8-1/antlr4-runtime-4.8-1.jar!/" />
|
||||
<root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-text/1.8/commons-text-1.8.jar!/" />
|
||||
<root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-lang3/3.9/commons-lang3-3.9.jar!/" />
|
||||
</CLASSES>
|
||||
<JAVADOC />
|
||||
<SOURCES />
|
||||
</library>
|
||||
</component>
|
||||
Binary file not shown.
@@ -1,4 +0,0 @@
|
||||
Executable .jar file with interactive, command line UI
|
||||
Usage: java -jar TCellSim.jar
|
||||
|
||||
Requires Java11 or higher (Openjdk-17 recommended)
|
||||
278
readme.md
Normal file
278
readme.md
Normal file
@@ -0,0 +1,278 @@
|
||||
# BiGpairSEQ SIMULATOR
|
||||
|
||||
|
||||
## ABOUT
|
||||
|
||||
This program simulates BiGpairSEQ (Bipartite Graph pairSEQ), a graph theory-based adaptation
|
||||
of the pairSEQ algorithm (Howie, et al. 2015) for pairing T cell receptor sequences.
|
||||
|
||||
## THEORY
|
||||
|
||||
Unlike pairSEQ, which calculates p-values for every TCR alpha/beta overlap and compares
|
||||
against a null distribution, BiGpairSEQ does not do any statistical calculations
|
||||
directly.
|
||||
|
||||
BiGpairSEQ creates a [simple bipartite weighted graph](https://en.wikipedia.org/wiki/Bipartite_graph) representing the sample plate.
|
||||
The distinct TCRA and TCRB sequences form the two sets of vertices. Every TCRA/TCRB pair that share a well
|
||||
are connected by an edge, with the edge weight set to the number of wells in which both sequences appear.
|
||||
(Sequences present in *all* wells are filtered out prior to creating the graph, as there is no signal in their occupancy pattern.)
|
||||
The problem of pairing TCRA/TCRB sequences thus reduces to the "assignment problem" of finding a maximum weight
|
||||
matching on a bipartite graph--the subset of vertex-disjoint edges whose weights sum to the maximum possible value.
|
||||
|
||||
This is a well-studied combinatorial optimization problem, with many known solutions.
|
||||
The most efficient algorithm known to the author for maximum weight matching of a bipartite graph with strictly integral weights
|
||||
is from Duan and Su (2012). For a graph with m edges, n vertices per side, and maximum integer edge weight N,
|
||||
their algorithm runs in **O(m sqrt(n) log(N))** time. As the graph representation of a pairSEQ experiment is
|
||||
bipartite with integer weights, this algorithm is ideal for BiGpairSEQ.
|
||||
|
||||
Unfortunately, it's a fairly new algorithm, and not yet implemented by the graph theory library used in this simulator.
|
||||
So this program instead uses the Fibonacci heap-based algorithm of Fredman and Tarjan (1987), which has a worst-case
|
||||
runtime of **O(n (n log(n) + m))**. The algorithm is implemented as described in Mehlhorn and Näher (1999).
|
||||
|
||||
The current version of the program uses a pairing heap instead of a Fibonacci heap for its priority queue,
|
||||
which has lower theoretical efficiency but also lower complexity overhead, and is often equivalently performant
|
||||
in practice.
|
||||
|
||||
## USAGE
|
||||
|
||||
### RUNNING THE PROGRAM
|
||||
|
||||
BiGpairSEQ_Sim is an executable .jar file. Requires Java 11 or higher. [OpenJDK 17](https://jdk.java.net/17/)
|
||||
recommended.
|
||||
|
||||
Run with the command:
|
||||
|
||||
`java -jar BiGpairSEQ_Sim.jar`
|
||||
|
||||
Processing sample plates with tens of thousands of sequences may require large amounts
|
||||
of RAM. It is often desirable to increase the JVM maximum heap allocation with the -Xmx flag.
|
||||
For example, to run the program with 32 gigabytes of memory, use the command:
|
||||
|
||||
`java -Xmx32G -jar BiGpairSEQ_Sim.jar`
|
||||
|
||||
Once running, BiGpairSEQ_Sim has an interactive, menu-driven CLI for generating files and simulating TCR pairing. The
|
||||
main menu looks like this:
|
||||
|
||||
```
|
||||
--------BiGPairSEQ SIMULATOR--------
|
||||
ALPHA/BETA T CELL RECEPTOR MATCHING
|
||||
USING WEIGHTED BIPARTITE GRAPHS
|
||||
------------------------------------
|
||||
Please select an option:
|
||||
1) Generate a population of distinct cells
|
||||
2) Generate a sample plate of T cells
|
||||
3) Generate CDR3 alpha/beta occupancy data and overlap graph
|
||||
4) Simulate bipartite graph CDR3 alpha/beta matching (BiGpairSEQ)
|
||||
9) About/Acknowledgments
|
||||
0) Exit
|
||||
```
|
||||
|
||||
### OUTPUT
|
||||
|
||||
To run the simulation, the program reads and writes 4 kinds of files:
|
||||
* Cell Sample files in CSV format
|
||||
* Sample Plate files in CSV format
|
||||
* Graph and Data files in binary object serialization format
|
||||
* Matching Results files in CSV format
|
||||
|
||||
When entering filenames, it is not necessary to include the file extension (.csv or .ser). When reading or
|
||||
writing files, the program will automatically add the correct extension to any filename without one.
|
||||
|
||||
#### Cell Sample Files
|
||||
Cell Sample files consist of any number of distinct "T cells." Every cell contains
|
||||
four sequences: Alpha CDR3, Beta CDR3, Alpha CDR1, Beta CDR1. The sequences are represented by
|
||||
random integers. CDR3 Alpha and Beta sequences are all unique within a given Cell Sample file. CDR1 Alpha and Beta sequences
|
||||
are not necessarily unique; the relative diversity can be set when making the file.
|
||||
|
||||
(Note: though cells still have CDR1 sequences, matching of CDR1s is currently awaiting re-implementation.)
|
||||
|
||||
Options when making a Cell Sample file:
|
||||
* Number of T cells to generate
|
||||
* Factor by which CDR3s are more diverse than CDR1s
|
||||
|
||||
Files are in CSV format. Rows are distinct T cells, columns are sequences within the cells.
|
||||
Comments are preceded by `#`
|
||||
|
||||
Structure:
|
||||
|
||||
---
|
||||
# Sample contains 1 unique CDR1 for every 4 unique CDR3s.
|
||||
| Alpha CDR3 | Beta CDR3 | Alpha CDR1 | Beta CDR1 |
|
||||
|---|---|---|---|
|
||||
|unique number|unique number|number|number|
|
||||
|
||||
---
|
||||
|
||||
#### Sample Plate Files
|
||||
Sample Plate files consist of any number of "wells" containing any number of T cells (as
|
||||
described above). The wells are filled randomly from a Cell Sample file, according to a selected
|
||||
frequency distribution. Additionally, every individual sequence within each cell may, with some
|
||||
given dropout probability, be omitted from the file; this simulates the effect of amplification errors
|
||||
prior to sequencing. Plates can also be partitioned into any number of sections, each of which can have a
|
||||
different concentration of T cells per well.
|
||||
|
||||
Options when making a Sample Plate file:
|
||||
* Cell Sample file to use
|
||||
* Statistical distribution to apply to Cell Sample file
|
||||
* Poisson
|
||||
* Gaussian
|
||||
* Standard deviation size
|
||||
* Exponential
|
||||
* Lambda value
|
||||
* (Based on the slope of the graph in Figure 4C of the pairSEQ paper, the distribution of the original experiment was exponential with a lambda of approximately 0.6. (Howie, et al. 2015))
|
||||
* Total number of wells on the plate
|
||||
* Number of sections on plate
|
||||
* Number of T cells per well
|
||||
* per section, if more than one section
|
||||
* Dropout rate
|
||||
|
||||
Files are in CSV format. There are no header labels. Every row represents a well.
|
||||
Every column represents an individual cell, containing four sequences, depicted as an array string:
|
||||
`[CDR3A, CDR3B, CDR1A, CDR1B]`. So a representative cell might look like this:
|
||||
|
||||
`[525902, 791533, -1, 866282]`
|
||||
|
||||
Notice that the CDR1 Alpha is missing in the cell above--sequence dropout from simulated amplification error.
|
||||
Dropout sequences are replaced with the value `-1`. Comments are preceded by `#`
|
||||
|
||||
Structure:
|
||||
|
||||
---
|
||||
```
|
||||
# Cell source file name:
|
||||
# Each row represents one well on the plate
|
||||
# Plate size:
|
||||
# Concentrations:
|
||||
# Lambda -or- StdDev:
|
||||
```
|
||||
| Well 1, cell 1 | Well 1, cell 2 | Well 1, cell 3| ... |
|
||||
|---|---|---|---|
|
||||
| **Well 2, cell 1** | **Well 2, cell 2** | **Well 2, cell 3**| **...** |
|
||||
| **Well 3, cell 1** | **Well 3, cell 2** | **Well 3, cell 3**| **...** |
|
||||
| **...** | **...** | **...** | **...** |
|
||||
|
||||
---
|
||||
|
||||
#### Graph and Data Files
|
||||
Graph and Data files are serialized binaries of a Java object containing the weighted bipartite graph representation of a
|
||||
Sample Plate, along with the necessary metadata for matching and results output. Making them requires a Cell Sample file
|
||||
(to construct a list of correct sequence pairs for checking the accuracy of BiGpairSEQ simulations) and a
|
||||
Sample Plate file (to construct the associated occupancy graph). These files can be several gigabytes in size.
|
||||
Writing them to a file lets us generate a graph and its metadata once, then use it for multiple different BiGpairSEQ simulations.
|
||||
|
||||
Options for creating a Graph and Data file:
|
||||
* The Cell Sample file to use
|
||||
* The Sample Plate file to use. (This must have been generated from the selected Cell Sample file.)
|
||||
|
||||
These files do not have a human-readable structure, and are not portable to other programs. (Export of graphs in a
|
||||
portable data format may be implemented in the future. The tricky part is encoding the necessary metadata.)
|
||||
|
||||
---
|
||||
|
||||
#### Matching Results Files
|
||||
Matching results files consist of the results of a BiGpairSEQ matching simulation.
|
||||
Files are in CSV format. Rows are sequence pairings with extra relevant data. Columns are pairing-specific details.
|
||||
Metadata about the matching simulation is included as comments. Comments are preceded by `#`.
|
||||
|
||||
Options when running a BiGpairSEQ simulation of CDR3 alpha/beta matching:
|
||||
* The minimum number of alpha/beta overlap wells to attempt to match
|
||||
* (must be >= 1)
|
||||
* The maximum number of alpha/beta overlap wells to attempt to match
|
||||
* (must be <= the number of wells on the plate - 1)
|
||||
* The maximum difference in alpha/beta occupancy to attempt to match
|
||||
* (Optional. To skip using this filter, enter a value >= the number of wells on the plate)
|
||||
* The minimum overlap percentage--the percentage of a sequence's occupied wells shared by another sequence--to attempt to match. Given as value in range 0 - 100.
|
||||
* (Optional. To skip using this filter, enter 0)
|
||||
|
||||
Example output:
|
||||
|
||||
---
|
||||
```
|
||||
# Source Sample Plate file: 4MilCellsPlate.csv
|
||||
# Source Graph and Data file: 4MilCellsPlateGraph.ser
|
||||
# T cell counts in sample plate wells: 30000
|
||||
# Total alphas found: 11813
|
||||
# Total betas found: 11808
|
||||
# High overlap threshold: 94
|
||||
# Low overlap threshold: 3
|
||||
# Minimum overlap percent: 0
|
||||
# Maximum occupancy difference: 96
|
||||
# Pairing attempt rate: 0.438
|
||||
# Correct pairings: 5151
|
||||
# Incorrect pairings: 18
|
||||
# Pairing error rate: 0.00348
|
||||
# Simulation time: 862 seconds
|
||||
```
|
||||
|
||||
| Alpha | Alpha well count | Beta | Beta well count | Overlap count | Matched Correctly? | P-value |
|
||||
|---|---|---|---|---|---|---|
|
||||
|5242972|17|1571520|18|17|true|1.41E-18|
|
||||
|5161027|18|2072219|18|18|true|7.31E-20|
|
||||
|4145198|33|1064455|30|29|true|2.65E-21|
|
||||
|7700582|18|112748|18|18|true|7.31E-20|
|
||||
|...|...|...|...|...|...|...|
|
||||
|
||||
---
|
||||
|
||||
**NOTE: The p-values in the output are not used for matching**—they aren't part of the BiGpairSEQ algorithm at all.
|
||||
P-values are calculated *after* BiGpairSEQ matching is completed, for purposes of comparison only,
|
||||
using the (2021 corrected) formula from the original pairSEQ paper. (Howie, et al. 2015)
|
||||
|
||||
### PERFORMANCE
|
||||
Performance details of the example excerpted above:
|
||||
|
||||
On a home computer with a Ryzen 5600X CPU, 64GB of 3200MHz DDR4 RAM (half of which was allocated to the Java Virtual Machine), and a PCIe 3.0 SSD, running Linux Mint 20.3 Edge (5.13 kernel),
|
||||
the author ran a BiGpairSEQ simulation of a 96-well sample plate with 30,000 T cells/well comprising ~11,800 alphas and betas,
|
||||
taken from a sample of 4,000,000 distinct cells with an exponential frequency distribution.
|
||||
|
||||
With min/max occupancy threshold of 3 and 94 wells for matching, and no other pre-filtering, BiGpairSEQ identified 5,151
|
||||
correct pairings and 18 incorrect pairings, for an accuracy of 99.652%.
|
||||
|
||||
The simulation time was 14'22". If intermediate results were held in memory, this would be equivalent to the total elapsed time.
|
||||
|
||||
Since this implementation of BiGpairSEQ writes intermediate results to disk (to improve the efficiency of *repeated* simulations
|
||||
with different filtering options), the actual elapsed time was greater. File I/O time was not measured, but took
|
||||
slightly less time than the simulation itself. Real elapsed time from start to finish was under 30 minutes.
|
||||
|
||||
## TODO
|
||||
|
||||
* ~~Try invoking GC at end of workloads to reduce paging to disk~~ DONE
|
||||
* ~~Hold graph data in memory until another graph is read-in?~~ ABANDONED
|
||||
* *No, this won't work, because BiGpairSEQ simulations alter the underlying graph based on filtering constraints. Changes would cascade with multiple experiments.*
|
||||
* See if there's a reasonable way to reformat Sample Plate files so that wells are columns instead of rows.
|
||||
* ~~Problem is variable number of cells in a well~~
|
||||
* ~~Apache Commons CSV library writes entries a row at a time~~
|
||||
* _Got this working, but at the cost of a profoundly strange bug in graph occupancy filtering. Have reverted the repo until I can figure out what caused that. Given how easily the tidyverse transposes CSV matrices in R, might not even be worth fixing._
|
||||
* Re-implement command line arguments, to enable scripting and statistical simulation studies
|
||||
* Implement sample plates with random numbers of T cells per well.
|
||||
* Possible BiGpairSEQ advantage over pairSEQ: BiGpairSEQ is resilient to variations in well population sizes on a sample plate; pairSEQ is not.
|
||||
* preliminary data suggests that BiGpairSEQ behaves roughly as though the whole plate had whatever the *average* well concentration is, but that's still speculative.
|
||||
* Enable GraphML output in addition to serialized object binaries, for data portability
|
||||
* Custom vertex type with attribute for sequence occupancy?
|
||||
* Re-implement CDR1 matching method
|
||||
* Implement Duan and Su's maximum weight matching algorithm
|
||||
* Add controllable algorithm-type parameter?
|
||||
* Test whether pairing heap (currently used) or Fibonacci heap is more efficient for priority queue in current matching algorithm
|
||||
* in theory Fibonacci heap should be more efficient, but complexity overhead may eliminate theoretical advantage
|
||||
* Add controllable heap-type parameter?
|
||||
|
||||
|
||||
|
||||
## CITATIONS
|
||||
* Howie, B., Sherwood, A. M., et al. ["High-throughput pairing of T cell receptor alpha and beta sequences."](https://pubmed.ncbi.nlm.nih.gov/26290413/) Sci. Transl. Med. 7, 301ra131 (2015)
|
||||
* Duan, R., Su H. ["A Scaling Algorithm for Maximum Weight Matching in Bipartite Graphs."](https://web.eecs.umich.edu/~pettie/matching/Duan-Su-scaling-bipartite-matching.pdf) Proceedings of the Twenty-Third Annual ACM-SIAM Symposium on Discrete Algorithms, p. 1413-1424. (2012)
|
||||
* Mehlhorn, K., Näher, St. [The LEDA Platform of Combinatorial and Geometric Computing.](https://people.mpi-inf.mpg.de/~mehlhorn/LEDAbook.html) Cambridge University Press. Chapter 7, Graph Algorithms; p. 132-162 (1999)
|
||||
* Fredman, M., Tarjan, R. ["Fibonacci heaps and their uses in improved network optimization algorithms."](https://www.cl.cam.ac.uk/teaching/1011/AlgorithII/1987-FredmanTar-fibonacci.pdf) J. ACM, 34(3):596–615 (1987)
|
||||
|
||||
## EXTERNAL LIBRARIES USED
|
||||
* [JGraphT](https://jgrapht.org) -- Graph theory data structures and algorithms
|
||||
* [JHeaps](https://www.jheaps.org) -- For pairing heap priority queue used in maximum weight matching algorithm
|
||||
* [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/) -- For CSV file output
|
||||
* [Apache Commons CLI](https://commons.apache.org/proper/commons-cli/) -- To enable command line arguments for scripting. (**Awaiting re-implementation**.)
|
||||
|
||||
## ACKNOWLEDGEMENTS
|
||||
BiGpairSEQ was conceived in collaboration with Dr. Alice MacQueen, who brought the original
|
||||
pairSEQ paper to the author's attention and explained all the biology terms he didn't know.
|
||||
|
||||
## AUTHOR
|
||||
BiGpairSEQ algorithm and simulation by Eugene Fischer, 2021. UI improvements and documentation, 2022.
|
||||
@@ -8,6 +8,9 @@ public abstract class Equations {
|
||||
return (int) ((Math.random() * (max - min)) + min);
|
||||
}
|
||||
|
||||
//pValue calculation as described in original pairSEQ paper.
|
||||
//Included for comparison with original results.
|
||||
//Not used by BiGpairSEQ for matching.
|
||||
public static double pValue(Integer w, Integer w_a, Integer w_b, double w_ab_d) {
|
||||
int w_ab = (int) w_ab_d;
|
||||
double pv = 0.0;
|
||||
@@ -18,6 +21,9 @@ public abstract class Equations {
|
||||
return pv;
|
||||
}
|
||||
|
||||
//Implementation of the (corrected) probability equation from pairSEQ paper.
|
||||
//Included for comparison with original results.
|
||||
//Not used by BiGpairSEQ for matching.
|
||||
private static double probPairedByChance(Integer w, Integer w_a, Integer w_b, Integer w_ab){
|
||||
BigInteger numer1 = choose(w, w_ab);
|
||||
BigInteger numer2 = choose(w - w_ab, w_a - w_ab);
|
||||
@@ -30,10 +36,9 @@ public abstract class Equations {
|
||||
return prob.doubleValue();
|
||||
}
|
||||
|
||||
/*
|
||||
* This works because nC(k+1) = nCk * (n-k)/(k+1)
|
||||
* Since nC0 = 1, can start there and generate all the rest.
|
||||
*/
|
||||
|
||||
//This works because nC(k+1) = nCk * (n-k)/(k+1)
|
||||
//Since nC0 = 1, can start there and generate all the rest.
|
||||
public static BigInteger choose(final int N, final int K) {
|
||||
BigInteger nCk = BigInteger.ONE;
|
||||
for (int k = 0; k < K; k++) {
|
||||
|
||||
29
src/main/java/GraphDataObjectReader.java
Normal file
29
src/main/java/GraphDataObjectReader.java
Normal file
@@ -0,0 +1,29 @@
|
||||
import java.io.*;
|
||||
|
||||
public class GraphDataObjectReader {
|
||||
private GraphWithMapData data;
|
||||
private String filename;
|
||||
|
||||
public GraphDataObjectReader(String filename) throws IOException {
|
||||
if(!filename.matches(".*\\.ser")){
|
||||
filename = filename + ".ser";
|
||||
}
|
||||
this.filename = filename;
|
||||
try(//don't need to close these because of try-with-resources
|
||||
BufferedInputStream fileIn = new BufferedInputStream(new FileInputStream(filename));
|
||||
ObjectInputStream in = new ObjectInputStream(fileIn))
|
||||
{
|
||||
data = (GraphWithMapData) in.readObject();
|
||||
} catch (FileNotFoundException | ClassNotFoundException ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public GraphWithMapData getData() {
|
||||
return data;
|
||||
}
|
||||
|
||||
public String getFilename() {
|
||||
return filename;
|
||||
}
|
||||
}
|
||||
28
src/main/java/GraphDataObjectWriter.java
Normal file
28
src/main/java/GraphDataObjectWriter.java
Normal file
@@ -0,0 +1,28 @@
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.ObjectOutputStream;
|
||||
|
||||
public class GraphDataObjectWriter {
|
||||
|
||||
private GraphWithMapData data;
|
||||
private String filename;
|
||||
|
||||
public GraphDataObjectWriter(String filename, GraphWithMapData data) {
|
||||
if(!filename.matches(".*\\.ser")){
|
||||
filename = filename + ".ser";
|
||||
}
|
||||
this.filename = filename;
|
||||
this.data = data;
|
||||
}
|
||||
|
||||
public void writeDataToFile() {
|
||||
try (BufferedOutputStream bufferedOut = new BufferedOutputStream(new FileOutputStream(filename));
|
||||
ObjectOutputStream out = new ObjectOutputStream(bufferedOut);
|
||||
){
|
||||
out.writeObject(data);
|
||||
} catch (IOException ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
35
src/main/java/GraphMLFileReader.java
Normal file
35
src/main/java/GraphMLFileReader.java
Normal file
@@ -0,0 +1,35 @@
|
||||
import org.jgrapht.graph.SimpleWeightedGraph;
|
||||
import org.jgrapht.nio.graphml.GraphMLImporter;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class GraphMLFileReader {
|
||||
|
||||
private String filename;
|
||||
private SimpleWeightedGraph graph;
|
||||
|
||||
public GraphMLFileReader(String filename, SimpleWeightedGraph graph) {
|
||||
if(!filename.matches(".*\\.graphml")){
|
||||
filename = filename + ".graphml";
|
||||
}
|
||||
this.filename = filename;
|
||||
this.graph = graph;
|
||||
|
||||
try(//don't need to close reader bc of try-with-resources auto-closing
|
||||
BufferedReader reader = Files.newBufferedReader(Path.of(filename));
|
||||
){
|
||||
GraphMLImporter<SimpleWeightedGraph, BufferedReader> importer = new GraphMLImporter<>();
|
||||
importer.importGraph(graph, reader);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
System.out.println("Graph file " + filename + " not found.");
|
||||
System.err.println(ex);
|
||||
}
|
||||
}
|
||||
|
||||
public SimpleWeightedGraph getGraph() { return graph; }
|
||||
|
||||
}
|
||||
35
src/main/java/GraphMLFileWriter.java
Normal file
35
src/main/java/GraphMLFileWriter.java
Normal file
@@ -0,0 +1,35 @@
|
||||
import org.jgrapht.graph.SimpleWeightedGraph;
|
||||
import org.jgrapht.nio.dot.DOTExporter;
|
||||
import org.jgrapht.nio.graphml.GraphMLExporter;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
public class GraphMLFileWriter {
|
||||
|
||||
String filename;
|
||||
SimpleWeightedGraph graph;
|
||||
|
||||
|
||||
public GraphMLFileWriter(String filename, SimpleWeightedGraph graph) {
|
||||
if(!filename.matches(".*\\.graphml")){
|
||||
filename = filename + ".graphml";
|
||||
}
|
||||
this.filename = filename;
|
||||
this.graph = graph;
|
||||
}
|
||||
|
||||
public void writeGraphToFile() {
|
||||
try(BufferedWriter writer = Files.newBufferedWriter(Path.of(filename), StandardOpenOption.CREATE_NEW);
|
||||
){
|
||||
GraphMLExporter<SimpleWeightedGraph, BufferedWriter> exporter = new GraphMLExporter<>();
|
||||
exporter.exportGraph(graph, writer);
|
||||
} catch(IOException ex){
|
||||
System.out.println("Could not make new file named "+filename);
|
||||
System.err.println(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
106
src/main/java/GraphWithMapData.java
Normal file
106
src/main/java/GraphWithMapData.java
Normal file
@@ -0,0 +1,106 @@
|
||||
import org.jgrapht.graph.SimpleWeightedGraph;

import java.time.Duration;
import java.util.Map;

//Can't just write the graph, because I need the occupancy data too.
//Makes most sense to serialize object and write that to a file.
//Which means there's no reason to split map data and graph data up.

/**
 * Serializable bundle of a sample plate's bipartite overlap graph together
 * with the metadata needed for matching and results output (well counts,
 * vertex-to-sequence index maps, per-sequence well occupancy counts, and the
 * time taken to build the graph). Instances are written/read as Java object
 * serialization binaries by GraphDataObjectWriter/GraphDataObjectReader.
 */
public class GraphWithMapData implements java.io.Serializable {

    //Sample Plate file this graph was built from; set after construction
    //via setSourceFilename, so it may be null.
    private String sourceFilename;
    //bipartite weighted graph: alpha and beta sequences as vertices,
    //edge weights = number of shared wells
    private final SimpleWeightedGraph graph;
    //total number of wells on the plate
    private Integer numWells;
    //distinct per-well cell concentrations present on the plate
    private Integer[] wellConcentrations;
    //count of distinct alpha sequences found
    private Integer alphaCount;
    //count of distinct beta sequences found
    private Integer betaCount;
    //alpha sequence -> its correct beta partner (ground truth for accuracy checks)
    private final Map<Integer, Integer> distCellsMapAlphaKey;
    //graph vertex id -> alpha sequence, and its inverse below
    private final Map<Integer, Integer> plateVtoAMap;
    //graph vertex id -> beta sequence, and its inverse below
    private final Map<Integer, Integer> plateVtoBMap;
    private final Map<Integer, Integer> plateAtoVMap;
    private final Map<Integer, Integer> plateBtoVMap;
    //alpha sequence -> number of wells it occupies
    private final Map<Integer, Integer> alphaWellCounts;
    //beta sequence -> number of wells it occupies
    private final Map<Integer, Integer> betaWellCounts;
    //time taken to construct the graph/data
    private final Duration time;

    /**
     * Bundles a constructed overlap graph with all of its matching metadata.
     * All arguments are stored as-is (no defensive copies), so callers should
     * not mutate them afterwards.
     */
    public GraphWithMapData(SimpleWeightedGraph graph, Integer numWells, Integer[] wellConcentrations,
                            Integer alphaCount, Integer betaCount,
                            Map<Integer, Integer> distCellsMapAlphaKey, Map<Integer, Integer> plateVtoAMap,
                            Map<Integer,Integer> plateVtoBMap, Map<Integer, Integer> plateAtoVMap,
                            Map<Integer, Integer> plateBtoVMap, Map<Integer, Integer> alphaWellCounts,
                            Map<Integer, Integer> betaWellCounts, Duration time) {
        this.graph = graph;
        this.numWells = numWells;
        this.wellConcentrations = wellConcentrations;
        this.alphaCount = alphaCount;
        this.betaCount = betaCount;
        this.distCellsMapAlphaKey = distCellsMapAlphaKey;
        this.plateVtoAMap = plateVtoAMap;
        this.plateVtoBMap = plateVtoBMap;
        this.plateAtoVMap = plateAtoVMap;
        this.plateBtoVMap = plateBtoVMap;
        this.alphaWellCounts = alphaWellCounts;
        this.betaWellCounts = betaWellCounts;
        this.time = time;
    }

    public SimpleWeightedGraph getGraph() {
        return graph;
    }

    public Integer getNumWells() {
        return numWells;
    }

    public Integer[] getWellConcentrations() {
        return wellConcentrations;
    }

    public Integer getAlphaCount() {
        return alphaCount;
    }

    public Integer getBetaCount() {
        return betaCount;
    }

    public Map<Integer, Integer> getDistCellsMapAlphaKey() {
        return distCellsMapAlphaKey;
    }

    public Map<Integer, Integer> getPlateVtoAMap() {
        return plateVtoAMap;
    }

    public Map<Integer, Integer> getPlateVtoBMap() {
        return plateVtoBMap;
    }

    public Map<Integer, Integer> getPlateAtoVMap() {
        return plateAtoVMap;
    }

    public Map<Integer, Integer> getPlateBtoVMap() {
        return plateBtoVMap;
    }

    public Map<Integer, Integer> getAlphaWellCounts() {
        return alphaWellCounts;
    }

    public Map<Integer, Integer> getBetaWellCounts() {
        return betaWellCounts;
    }

    public Duration getTime() {
        return time;
    }

    //Set after construction because the filename is only known at write time.
    public void setSourceFilename(String filename) {
        this.sourceFilename = filename;
    }

    public String getSourceFilename() {
        return sourceFilename;
    }
}
|
||||
@@ -21,7 +21,6 @@ public class Plate {
|
||||
this.size = size;
|
||||
this.error = error;
|
||||
this.concentrations = concentrations;
|
||||
//this.stdDev = stdDev;
|
||||
wells = new ArrayList<>();
|
||||
}
|
||||
|
||||
@@ -29,6 +28,17 @@ public class Plate {
|
||||
this.sourceFile = sourceFileName;
|
||||
this.wells = wells;
|
||||
this.size = wells.size();
|
||||
|
||||
List<Integer> concentrations = new ArrayList<>();
|
||||
for (List<Integer[]> w: wells) {
|
||||
if(!concentrations.contains(w.size())){
|
||||
concentrations.add(w.size());
|
||||
}
|
||||
}
|
||||
this.concentrations = new Integer[concentrations.size()];
|
||||
for (int i = 0; i < this.concentrations.length; i++) {
|
||||
this.concentrations[i] = concentrations.get(i);
|
||||
}
|
||||
}
|
||||
|
||||
public void fillWellsExponential(String sourceFileName, List<Integer[]> cells, double lambda){
|
||||
@@ -45,6 +55,7 @@ public class Plate {
|
||||
List<Integer[]> well = new ArrayList<>();
|
||||
for (int j = 0; j < concentrations[section]; j++) {
|
||||
do {
|
||||
//inverse transform sampling: for random number u in [0,1), x = log(1-u) / (-lambda)
|
||||
m = (Math.log10((1 - rand.nextDouble()))/(-lambda)) * Math.sqrt(cells.size());
|
||||
} while (m >= cells.size() || m < 0);
|
||||
n = (int) Math.floor(m);
|
||||
@@ -57,7 +68,7 @@ public class Plate {
|
||||
}
|
||||
Integer[] cellToAdd = cells.get(n).clone();
|
||||
for(int k = 0; k < cellToAdd.length; k++){
|
||||
if(Math.abs(rand.nextDouble()) < error){//error applied to each peptide
|
||||
if(Math.abs(rand.nextDouble()) < error){//error applied to each seqeunce
|
||||
cellToAdd[k] = -1;
|
||||
}
|
||||
}
|
||||
@@ -87,7 +98,7 @@ public class Plate {
|
||||
n = (int) Math.floor(m);
|
||||
Integer[] cellToAdd = cells.get(n).clone();
|
||||
for(int k = 0; k < cellToAdd.length; k++){
|
||||
if(Math.abs(rand.nextDouble()) < error){//error applied to each peptide
|
||||
if(Math.abs(rand.nextDouble()) < error){//error applied to each sequence
|
||||
cellToAdd[k] = -1;
|
||||
}
|
||||
}
|
||||
@@ -123,30 +134,30 @@ public class Plate {
|
||||
return wells;
|
||||
}
|
||||
|
||||
//returns a map of the counts of the peptide at cell index pIndex, in all wells
|
||||
public Map<Integer, Integer> assayWellsPeptideP(int... pIndices){
|
||||
return this.assayWellsPeptideP(0, size, pIndices);
|
||||
//returns a map of the counts of the sequence at cell index sIndex, in all wells
|
||||
public Map<Integer, Integer> assayWellsSequenceS(int... sIndices){
|
||||
return this.assayWellsSequenceS(0, size, sIndices);
|
||||
}
|
||||
|
||||
//returns a map of the counts of the peptide at cell index pIndex, in a specific well
|
||||
public Map<Integer, Integer> assayWellsPeptideP(int n, int... pIndices) { return this.assayWellsPeptideP(n, n+1, pIndices);}
|
||||
//returns a map of the counts of the sequence at cell index sIndex, in a specific well
|
||||
public Map<Integer, Integer> assayWellsSequenceS(int n, int... sIndices) { return this.assayWellsSequenceS(n, n+1, sIndices);}
|
||||
|
||||
//returns a map of the counts of the peptide at cell index pIndex, in a range of wells
|
||||
public Map<Integer, Integer> assayWellsPeptideP(int start, int end, int... pIndices) {
|
||||
//returns a map of the counts of the sequence at cell index sIndex, in a range of wells
|
||||
public Map<Integer, Integer> assayWellsSequenceS(int start, int end, int... sIndices) {
|
||||
Map<Integer,Integer> assay = new HashMap<>();
|
||||
for(int pIndex: pIndices){
|
||||
for(int pIndex: sIndices){
|
||||
for(int i = start; i < end; i++){
|
||||
countPeptides(assay, wells.get(i), pIndex);
|
||||
countSequences(assay, wells.get(i), pIndex);
|
||||
}
|
||||
}
|
||||
return assay;
|
||||
}
|
||||
//For the peptides at cell indices pIndices, counts number of unique peptides in the given well into the given map
|
||||
private void countPeptides(Map<Integer, Integer> wellMap, List<Integer[]> well, int... pIndices) {
|
||||
//For the sequences at cell indices sIndices, counts number of unique sequences in the given well into the given map
|
||||
private void countSequences(Map<Integer, Integer> wellMap, List<Integer[]> well, int... sIndices) {
|
||||
for(Integer[] cell : well) {
|
||||
for(int pIndex: pIndices){
|
||||
if(cell[pIndex] != -1){
|
||||
wellMap.merge(cell[pIndex], 1, (oldValue, newValue) -> oldValue + newValue);
|
||||
for(int sIndex: sIndices){
|
||||
if(cell[sIndex] != -1){
|
||||
wellMap.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -73,6 +73,7 @@ public class PlateFileWriter {
|
||||
}
|
||||
rows.add(tmp);
|
||||
}
|
||||
//build string of well concentrations
|
||||
StringBuilder concen = new StringBuilder();
|
||||
for(Integer i: concentrations){
|
||||
concen.append(i.toString());
|
||||
@@ -80,7 +81,9 @@ public class PlateFileWriter {
|
||||
}
|
||||
String concenString = concen.toString();
|
||||
|
||||
CSVFormat plateFileFormat = CSVFormat.Builder.create().setCommentMarker('#').build();
|
||||
CSVFormat plateFileFormat = CSVFormat.Builder.create()
|
||||
.setCommentMarker('#')
|
||||
.build();
|
||||
|
||||
try(BufferedWriter writer = Files.newBufferedWriter(Path.of(filename), StandardOpenOption.CREATE_NEW);
|
||||
CSVPrinter printer = new CSVPrinter(writer, plateFileFormat);
|
||||
|
||||
@@ -1,84 +1,27 @@
|
||||
//import org.jgrapht.Graph;
|
||||
import org.jgrapht.alg.interfaces.MatchingAlgorithm;
|
||||
import org.jgrapht.alg.matching.MaximumWeightBipartiteMatching;
|
||||
import org.jgrapht.generate.SimpleWeightedBipartiteGraphMatrixGenerator;
|
||||
import org.jgrapht.graph.DefaultWeightedEdge;
|
||||
import org.jgrapht.graph.SimpleWeightedGraph;
|
||||
import org.jheaps.AddressableHeap;
|
||||
import org.jheaps.tree.PairingHeap;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.math.BigInteger;
|
||||
import java.math.MathContext;
|
||||
import java.text.NumberFormat;
|
||||
import java.time.Instant;
|
||||
import java.time.Duration;
|
||||
import java.util.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
//NOTE: "sequence" in method and variable names refers to a peptide sequence from a simulated T cell
|
||||
public class Simulator {
|
||||
private static int cdr3AlphaIndex = 0;
|
||||
private static int cdr3BetaIndex = 1;
|
||||
private static int cdr1AlphaIndex = 2;
|
||||
private static int cdr1BetaIndex = 3;
|
||||
private static final int cdr3AlphaIndex = 0;
|
||||
private static final int cdr3BetaIndex = 1;
|
||||
private static final int cdr1AlphaIndex = 2;
|
||||
private static final int cdr1BetaIndex = 3;
|
||||
|
||||
|
||||
//Tested generating the cell sample file itself with a set distribution, but it wasn't working well
|
||||
//realized I'd have to change matching algos as well, so abandoned it.
|
||||
// public static CellSample generateCellSampleExp(Integer numDistinctCells, Integer cdr1Freq){
|
||||
// //was told I=in real T cells, CDR1s have about one third the diversity of CDR3s
|
||||
// //this could be wrong, though. could be less diversity than that.
|
||||
// //previous sim was only CDR3s
|
||||
// //Integer numDistinctCells = 1000000;
|
||||
// //int cdr1Freq = 3;
|
||||
//
|
||||
// List<Integer> numbersCDR3 = new ArrayList<>();
|
||||
// List<Integer> numbersCDR1 = new ArrayList<>();
|
||||
// Integer numDistCDR3s = 2 * numDistinctCells + 1;
|
||||
// IntStream.range(1, numDistCDR3s + 1).forEach(i -> numbersCDR3.add(i));
|
||||
// IntStream.range(numDistCDR3s + 1, numDistCDR3s + 1 + (numDistCDR3s / cdr1Freq) + 1).forEach(i -> numbersCDR1.add(i));
|
||||
// Collections.shuffle(numbersCDR3);
|
||||
// Collections.shuffle(numbersCDR1);
|
||||
//
|
||||
// //Each cell represented by 4 values
|
||||
// //two CDR3s, and two CDR1s. First two values are CDR3s, second two are CDR1s
|
||||
// List<Integer[]> distinctCells = new ArrayList<>();
|
||||
// for(int i = 0; i < numbersCDR3.size() - 1; i = i + 2){
|
||||
// Integer tmpCDR3a = numbersCDR3.get(i);
|
||||
// Integer tmpCDR3b = numbersCDR3.get(i+1);
|
||||
// Integer tmpCDR1a = numbersCDR1.get(i % numbersCDR1.size());
|
||||
// Integer tmpCDR1b = numbersCDR1.get((i+1) % numbersCDR1.size());
|
||||
// Integer[] tmp = {tmpCDR3a, tmpCDR3b, tmpCDR1a, tmpCDR1b};
|
||||
// distinctCells.add(tmp);
|
||||
// }
|
||||
// List<Integer[]>sampleCells = new ArrayList<>();
|
||||
// int count;
|
||||
// int lastFactor = 0;
|
||||
// int factor = 10;
|
||||
// int index = 0;
|
||||
// while(numDistinctCells / factor >= 1){
|
||||
// int i = index;
|
||||
// while(i < index + factor - lastFactor){
|
||||
// count = 0;
|
||||
// while(count < (numDistinctCells/(factor * 10))){
|
||||
// sampleCells.add(distinctCells.get(i));
|
||||
// count++;
|
||||
// }
|
||||
// i++;
|
||||
// }
|
||||
// index = i;
|
||||
// lastFactor = factor;
|
||||
// factor *=10;
|
||||
// }
|
||||
// System.out.println("Total cells in sample: " + sampleCells.size());
|
||||
// System.out.println("Distinct cells in sample: " + index);
|
||||
// return new CellSample(sampleCells, cdr1Freq);
|
||||
// }
|
||||
public static CellSample generateCellSample(Integer numDistinctCells, Integer cdr1Freq) {
|
||||
//In real T cells, CDR1s have about one third the diversity of CDR3s
|
||||
//previous sim was only CDR3s
|
||||
List<Integer> numbersCDR3 = new ArrayList<>();
|
||||
List<Integer> numbersCDR1 = new ArrayList<>();
|
||||
Integer numDistCDR3s = 2 * numDistinctCells + 1;
|
||||
@@ -88,7 +31,7 @@ public class Simulator {
|
||||
Collections.shuffle(numbersCDR1);
|
||||
|
||||
//Each cell represented by 4 values
|
||||
//two CDR3s, and two CDR1s. First two values are CDR3s, second two are CDR1s
|
||||
//two CDR3s, and two CDR1s. First two values are CDR3s (alpha, beta), second two are CDR1s (alpha, beta)
|
||||
List<Integer[]> distinctCells = new ArrayList<>();
|
||||
for(int i = 0; i < numbersCDR3.size() - 1; i = i + 2){
|
||||
Integer tmpCDR3a = numbersCDR3.get(i);
|
||||
@@ -101,43 +44,39 @@ public class Simulator {
|
||||
return new CellSample(distinctCells, cdr1Freq);
|
||||
}
|
||||
|
||||
public static MatchingResult matchCDR3s(List<Integer[]> distinctCells,
|
||||
Plate samplePlate, Integer lowThreshold,
|
||||
Integer highThreshold, boolean verbose){
|
||||
if(verbose){System.out.println("Cells: " + distinctCells.size());}
|
||||
|
||||
//Make the graph needed for matching CDR3s
|
||||
public static GraphWithMapData makeGraph(List<Integer[]> distinctCells, Plate samplePlate, boolean verbose) {
|
||||
Instant start = Instant.now();
|
||||
int numWells = samplePlate.getSize();
|
||||
int[] alphaIndex = {cdr3AlphaIndex};
|
||||
int[] betaIndex = {cdr3BetaIndex};
|
||||
int numWells = samplePlate.getSize();
|
||||
|
||||
if(verbose){System.out.println("Making cell maps");}
|
||||
//HashMap keyed to Alphas, values Betas
|
||||
Map<Integer, Integer> distCellsMapAlphaKey = makePeptideToPeptideMap(distinctCells, 0, 1);
|
||||
Map<Integer, Integer> distCellsMapAlphaKey = makeSequenceToSequenceMap(distinctCells, 0, 1);
|
||||
if(verbose){System.out.println("Cell maps made");}
|
||||
|
||||
if(verbose){System.out.println("Making well maps");}
|
||||
Map<Integer, Integer> allAlphas = samplePlate.assayWellsPeptideP(alphaIndex);
|
||||
Map<Integer, Integer> allBetas = samplePlate.assayWellsPeptideP(betaIndex);
|
||||
Map<Integer, Integer> allAlphas = samplePlate.assayWellsSequenceS(alphaIndex);
|
||||
Map<Integer, Integer> allBetas = samplePlate.assayWellsSequenceS(betaIndex);
|
||||
int alphaCount = allAlphas.size();
|
||||
if(verbose){System.out.println("all alphas count: " + alphaCount);}
|
||||
if(verbose){System.out.println("All alphas count: " + alphaCount);}
|
||||
int betaCount = allBetas.size();
|
||||
if(verbose){System.out.println("all betas count: " + betaCount);}
|
||||
if(verbose){System.out.println("All betas count: " + betaCount);}
|
||||
|
||||
if(verbose){System.out.println("Well maps made");}
|
||||
|
||||
//Remove saturating-occupancy peptides because they have no signal value.
|
||||
//Remove below-minimum-overlap-threshold peptides because they can't possibly have an overlap with another
|
||||
//peptide that's above the threshold.
|
||||
if(verbose){System.out.println("Removing peptides present in all wells.");}
|
||||
if(verbose){System.out.println("Removing peptides with occupancy below the minimum overlap threshold");}
|
||||
filterByOccupancyThreshold(allAlphas, lowThreshold, numWells - 1);
|
||||
filterByOccupancyThreshold(allBetas, lowThreshold, numWells - 1);
|
||||
if(verbose){System.out.println("Peptides removed");}
|
||||
//Remove saturating-occupancy sequences because they have no signal value.
|
||||
//Remove sequences with total occupancy below minimum pair overlap threshold
|
||||
if(verbose){System.out.println("Removing sequences present in all wells.");}
|
||||
//if(verbose){System.out.println("Removing sequences with occupancy below the minimum overlap threshold");}
|
||||
filterByOccupancyThreshold(allAlphas, 1, numWells - 1);
|
||||
filterByOccupancyThreshold(allBetas, 1, numWells - 1);
|
||||
if(verbose){System.out.println("Sequences removed");}
|
||||
int pairableAlphaCount = allAlphas.size();
|
||||
if(verbose){System.out.println("Remaining alpha count: " + pairableAlphaCount);}
|
||||
if(verbose){System.out.println("Remaining alphas count: " + pairableAlphaCount);}
|
||||
int pairableBetaCount = allBetas.size();
|
||||
if(verbose){System.out.println("Remaining beta count: " + pairableBetaCount);}
|
||||
if(verbose){System.out.println("Remaining betas count: " + pairableBetaCount);}
|
||||
|
||||
if(verbose){System.out.println("Making vertex maps");}
|
||||
//For the SimpleWeightedBipartiteGraphMatrixGenerator, all vertices must have
|
||||
@@ -147,17 +86,20 @@ public class Simulator {
|
||||
//subtract the vertexStartValue from betas to use their vertex labels as array indices
|
||||
Integer vertexStartValue = 0;
|
||||
//keys are sequential integer vertices, values are alphas
|
||||
Map<Integer, Integer> plateVtoAMap = makeVertexToPeptideMap(allAlphas, vertexStartValue);
|
||||
//New start value for vertex to beta map should be one more than final vertex value in alpha map
|
||||
Map<Integer, Integer> plateVtoAMap = makeVertexToSequenceMap(allAlphas, vertexStartValue);
|
||||
//new start value for vertex to beta map should be one more than final vertex value in alpha map
|
||||
vertexStartValue += plateVtoAMap.size();
|
||||
//keys are sequential integers vertices, values are betas
|
||||
Map<Integer, Integer> plateVtoBMap = makeVertexToPeptideMap(allBetas, vertexStartValue);
|
||||
Map<Integer, Integer> plateVtoBMap = makeVertexToSequenceMap(allBetas, vertexStartValue);
|
||||
//keys are alphas, values are sequential integer vertices from previous map
|
||||
Map<Integer, Integer> plateAtoVMap = invertVertexMap(plateVtoAMap);
|
||||
//keys are betas, values are sequential integer vertices from previous map
|
||||
Map<Integer, Integer> plateBtoVMap = invertVertexMap(plateVtoBMap);
|
||||
if(verbose){System.out.println("Vertex maps made");}
|
||||
|
||||
//make adjacency matrix for bipartite graph generator
|
||||
//(technically this is only 1/4 of an adjacency matrix, but that's all you need
|
||||
//for a bipartite graph, and all the SimpleWeightedBipartiteGraphMatrixGenerator class expects.)
|
||||
if(verbose){System.out.println("Creating adjacency matrix");}
|
||||
//Count how many wells each alpha appears in
|
||||
Map<Integer, Integer> alphaWellCounts = new HashMap<>();
|
||||
@@ -165,54 +107,78 @@ public class Simulator {
|
||||
Map<Integer, Integer> betaWellCounts = new HashMap<>();
|
||||
//the adjacency matrix to be used by the graph generator
|
||||
double[][] weights = new double[plateVtoAMap.size()][plateVtoBMap.size()];
|
||||
countPeptidesAndFillMatrix(samplePlate, allAlphas, allBetas, plateAtoVMap,
|
||||
countSequencesAndFillMatrix(samplePlate, allAlphas, allBetas, plateAtoVMap,
|
||||
plateBtoVMap, alphaIndex, betaIndex, alphaWellCounts, betaWellCounts, weights);
|
||||
if(verbose){System.out.println("matrix created");}
|
||||
if(verbose){System.out.println("Matrix created");}
|
||||
|
||||
/**
|
||||
* This is where the bipartite graph is created
|
||||
*/
|
||||
if(verbose){System.out.println("creating graph");}
|
||||
//create bipartite graph
|
||||
if(verbose){System.out.println("Creating graph");}
|
||||
//the graph object
|
||||
SimpleWeightedGraph<Integer, DefaultWeightedEdge> graph =
|
||||
new SimpleWeightedGraph<>(DefaultWeightedEdge.class);
|
||||
//the graph generator
|
||||
SimpleWeightedBipartiteGraphMatrixGenerator graphGenerator = new SimpleWeightedBipartiteGraphMatrixGenerator();
|
||||
//the list of alpha vertices
|
||||
List<Integer> alphaVertices = new ArrayList<>();
|
||||
alphaVertices.addAll(plateVtoAMap.keySet()); //This will work because LinkedHashMap preserves order of entry
|
||||
List<Integer> alphaVertices = new ArrayList<>(plateVtoAMap.keySet()); //This will work because LinkedHashMap preserves order of entry
|
||||
graphGenerator.first(alphaVertices);
|
||||
//the list of beta vertices
|
||||
List<Integer> betaVertices = new ArrayList<>();
|
||||
betaVertices.addAll(plateVtoBMap.keySet());
|
||||
List<Integer> betaVertices = new ArrayList<>(plateVtoBMap.keySet());
|
||||
graphGenerator.second(betaVertices); //This will work because LinkedHashMap preserves order of entry
|
||||
//use adjacency matrix of weight created previously
|
||||
graphGenerator.weights(weights);
|
||||
graphGenerator.generateGraph(graph);
|
||||
if(verbose){System.out.println("Graph created");}
|
||||
|
||||
if(verbose){System.out.println("Eliminating edges with weights outside threshold values");}
|
||||
Instant stop = Instant.now();
|
||||
Duration time = Duration.between(start, stop);
|
||||
//return GraphWithMapData object
|
||||
return new GraphWithMapData(graph, numWells, samplePlate.getConcentrations(), alphaCount, betaCount,
|
||||
distCellsMapAlphaKey, plateVtoAMap, plateVtoBMap, plateAtoVMap,
|
||||
plateBtoVMap, alphaWellCounts, betaWellCounts, time);
|
||||
}
|
||||
|
||||
//match CDR3s.
|
||||
public static MatchingResult matchCDR3s(GraphWithMapData data, String dataFilename, Integer lowThreshold,
|
||||
Integer highThreshold, Integer maxOccupancyDifference,
|
||||
Integer minOverlapPercent, boolean verbose) {
|
||||
Instant start = Instant.now();
|
||||
int numWells = data.getNumWells();
|
||||
Integer alphaCount = data.getAlphaCount();
|
||||
Integer betaCount = data.getBetaCount();
|
||||
Map<Integer, Integer> distCellsMapAlphaKey = data.getDistCellsMapAlphaKey();
|
||||
Map<Integer, Integer> plateVtoAMap = data.getPlateVtoAMap();
|
||||
Map<Integer, Integer> plateVtoBMap = data.getPlateVtoBMap();
|
||||
Map<Integer, Integer> alphaWellCounts = data.getAlphaWellCounts();
|
||||
Map<Integer, Integer> betaWellCounts = data.getBetaWellCounts();
|
||||
SimpleWeightedGraph<Integer, DefaultWeightedEdge> graph = data.getGraph();
|
||||
|
||||
//remove weights outside given overlap thresholds
|
||||
if(verbose){System.out.println("Eliminating edges with weights outside overlap threshold values");}
|
||||
filterByOccupancyThreshold(graph, lowThreshold, highThreshold);
|
||||
if(verbose){System.out.println("Over- and under-weight edges set to 0.0");}
|
||||
|
||||
//Filter by overlap size
|
||||
if(verbose){System.out.println("Eliminating edges with weights less than " + minOverlapPercent.toString() +
|
||||
" percent of vertex occupancy value.");}
|
||||
filterByOverlapSize(graph, alphaWellCounts, betaWellCounts, plateVtoAMap, plateVtoBMap, minOverlapPercent);
|
||||
if(verbose){System.out.println("Edges with weights too far below vertex occupancy values set to 0.0");}
|
||||
|
||||
/**
|
||||
* This is where the maximum weighted matching is found
|
||||
*/
|
||||
// if(verbose){System.out.println("Finding maximum weighted matching");}
|
||||
// MaximumWeightBipartiteMatching maxWeightMatching =
|
||||
// new MaximumWeightBipartiteMatching(graph, plateVtoAMap.keySet(), plateVtoBMap.keySet());
|
||||
// MatchingAlgorithm.Matching<String, DefaultWeightedEdge> graphMatching = maxWeightMatching.getMatching();
|
||||
// if(verbose){System.out.println("Matching completed");}
|
||||
// Instant stop = Instant.now();
|
||||
|
||||
//trying with jheaps now
|
||||
//Filter by relative occupancy
|
||||
if(verbose){System.out.println("Eliminating edges between vertices with occupancy difference > "
|
||||
+ maxOccupancyDifference);}
|
||||
filterByRelativeOccupancy(graph, alphaWellCounts, betaWellCounts, plateVtoAMap, plateVtoBMap,
|
||||
maxOccupancyDifference);
|
||||
if(verbose){System.out.println("Edges between vertices of with excessively different occupancy values " +
|
||||
"set to 0.0");}
|
||||
//Find Maximum Weighted Matching
|
||||
//using jheaps library class PairingHeap for improved efficiency
|
||||
if(verbose){System.out.println("Finding maximum weighted matching");}
|
||||
//Attempting to use addressable heap to improve performance
|
||||
MaximumWeightBipartiteMatching maxWeightMatching =
|
||||
new MaximumWeightBipartiteMatching(graph,
|
||||
plateVtoAMap.keySet(),
|
||||
plateVtoBMap.keySet(),
|
||||
(i) -> new PairingHeap(Comparator.naturalOrder()));
|
||||
i -> new PairingHeap(Comparator.naturalOrder()));
|
||||
MatchingAlgorithm.Matching<String, DefaultWeightedEdge> graphMatching = maxWeightMatching.getMatching();
|
||||
if(verbose){System.out.println("Matching completed");}
|
||||
Instant stop = Instant.now();
|
||||
@@ -227,16 +193,15 @@ public class Simulator {
|
||||
header.add("Matched correctly?");
|
||||
header.add("P-value");
|
||||
|
||||
|
||||
//Results for csv file
|
||||
List<List<String>> allResults = new ArrayList<>();
|
||||
NumberFormat nf = NumberFormat.getInstance(Locale.US);
|
||||
MathContext mc = new MathContext(3);
|
||||
Iterator<DefaultWeightedEdge> weightIter = graphMatching.iterator();
|
||||
DefaultWeightedEdge e = null;
|
||||
DefaultWeightedEdge e;
|
||||
int trueCount = 0;
|
||||
int falseCount = 0;
|
||||
boolean check = false;
|
||||
boolean check;
|
||||
Map<Integer, Integer> matchMap = new HashMap<>();
|
||||
while(weightIter.hasNext()) {
|
||||
e = weightIter.next();
|
||||
@@ -269,31 +234,44 @@ public class Simulator {
|
||||
}
|
||||
|
||||
//Metadate comments for CSV file
|
||||
int min = alphaCount > betaCount ? betaCount : alphaCount;
|
||||
int min = Math.min(alphaCount, betaCount);
|
||||
double attemptRate = (double) (trueCount + falseCount) / min;
|
||||
BigDecimal attemptRateTrunc = new BigDecimal(attemptRate, mc);
|
||||
double pairingErrorRate = (double) falseCount / (trueCount + falseCount);
|
||||
BigDecimal pairingErrorRateTrunc = new BigDecimal(pairingErrorRate, mc);
|
||||
//get list of well concentrations
|
||||
List<Integer> wellConcentrations = Arrays.asList(data.getWellConcentrations());
|
||||
//make string out of concentrations list
|
||||
StringBuilder concentrationStringBuilder = new StringBuilder();
|
||||
for(Integer i: wellConcentrations){
|
||||
concentrationStringBuilder.append(i.toString());
|
||||
concentrationStringBuilder.append(" ");
|
||||
}
|
||||
String concentrationString = concentrationStringBuilder.toString();
|
||||
|
||||
List<String> comments = new ArrayList<>();
|
||||
comments.add("Source Sample Plate filename: " + data.getSourceFilename());
|
||||
comments.add("Source Graph and Data filename: " + dataFilename);
|
||||
comments.add("T cell counts in sample plate wells: " + concentrationString);
|
||||
comments.add("Total alphas found: " + alphaCount);
|
||||
comments.add("Total betas found: " + betaCount);
|
||||
comments.add("High overlap threshold: " + highThreshold);
|
||||
comments.add("Low overlap threshold: " + lowThreshold);
|
||||
comments.add("Minimum overlap percent: " + minOverlapPercent);
|
||||
comments.add("Maximum occupancy difference: " + maxOccupancyDifference);
|
||||
comments.add("Pairing attempt rate: " + attemptRateTrunc);
|
||||
comments.add("Correct pairings: " + trueCount);
|
||||
comments.add("Incorrect pairings: " + falseCount);
|
||||
comments.add("Pairing error rate: " + pairingErrorRateTrunc);
|
||||
Duration time = Duration.between(start, stop);
|
||||
time = time.plus(data.getTime());
|
||||
comments.add("Simulation time: " + nf.format(time.toSeconds()) + " seconds");
|
||||
if(verbose){
|
||||
for(String s: comments){
|
||||
System.out.println(s);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return new MatchingResult(samplePlate.getSourceFileName(), comments, header, allResults, matchMap, time);
|
||||
return new MatchingResult(data.getSourceFilename(), comments, header, allResults, matchMap, time);
|
||||
}
|
||||
|
||||
//Simulated matching of CDR1s to CDR3s. Requires MatchingResult from prior run of matchCDR3s.
|
||||
@@ -313,13 +291,13 @@ public class Simulator {
|
||||
System.out.println("Previous match maps made");
|
||||
|
||||
System.out.println("Making cell maps");
|
||||
Map<Integer, Integer> alphaCDR3toCDR1Map = makePeptideToPeptideMap(distinctCells, cdr3AlphaIndex, cdr1AlphaIndex);
|
||||
Map<Integer, Integer> betaCDR3toCDR1Map = makePeptideToPeptideMap(distinctCells, cdr3BetaIndex, cdr1BetaIndex);
|
||||
Map<Integer, Integer> alphaCDR3toCDR1Map = makeSequenceToSequenceMap(distinctCells, cdr3AlphaIndex, cdr1AlphaIndex);
|
||||
Map<Integer, Integer> betaCDR3toCDR1Map = makeSequenceToSequenceMap(distinctCells, cdr3BetaIndex, cdr1BetaIndex);
|
||||
System.out.println("Cell maps made");
|
||||
|
||||
System.out.println("Making well maps");
|
||||
Map<Integer, Integer> allCDR3s = samplePlate.assayWellsPeptideP(cdr3Indices);
|
||||
Map<Integer, Integer> allCDR1s = samplePlate.assayWellsPeptideP(cdr1Indices);
|
||||
Map<Integer, Integer> allCDR3s = samplePlate.assayWellsSequenceS(cdr3Indices);
|
||||
Map<Integer, Integer> allCDR1s = samplePlate.assayWellsSequenceS(cdr1Indices);
|
||||
int CDR3Count = allCDR3s.size();
|
||||
System.out.println("all CDR3s count: " + CDR3Count);
|
||||
int CDR1Count = allCDR1s.size();
|
||||
@@ -353,11 +331,11 @@ public class Simulator {
|
||||
// subtract the vertexStartValue from CDR1s to use their vertex labels as array indices
|
||||
Integer vertexStartValue = 0;
|
||||
//keys are sequential integer vertices, values are CDR3s
|
||||
Map<Integer, Integer> plateVtoCDR3Map = makeVertexToPeptideMap(allCDR3s, vertexStartValue);
|
||||
Map<Integer, Integer> plateVtoCDR3Map = makeVertexToSequenceMap(allCDR3s, vertexStartValue);
|
||||
//New start value for vertex to CDR1 map should be one more than final vertex value in CDR3 map
|
||||
vertexStartValue += plateVtoCDR3Map.size();
|
||||
//keys are sequential integers vertices, values are CDR1s
|
||||
Map<Integer, Integer> plateVtoCDR1Map = makeVertexToPeptideMap(allCDR1s, vertexStartValue);
|
||||
Map<Integer, Integer> plateVtoCDR1Map = makeVertexToSequenceMap(allCDR1s, vertexStartValue);
|
||||
//keys are CDR3s, values are sequential integer vertices from previous map
|
||||
Map<Integer, Integer> plateCDR3toVMap = invertVertexMap(plateVtoCDR3Map);
|
||||
//keys are CDR1s, values are sequential integer vertices from previous map
|
||||
@@ -374,7 +352,7 @@ public class Simulator {
|
||||
Map<Integer, Integer> wellNCDR3s = null;
|
||||
Map<Integer, Integer> wellNCDR1s = null;
|
||||
double[][] weights = new double[plateVtoCDR3Map.size()][plateVtoCDR1Map.size()];
|
||||
countPeptidesAndFillMatrix(samplePlate, allCDR3s, allCDR1s, plateCDR3toVMap, plateCDR1toVMap,
|
||||
countSequencesAndFillMatrix(samplePlate, allCDR3s, allCDR1s, plateCDR3toVMap, plateCDR1toVMap,
|
||||
cdr3Indices, cdr1Indices, cdr3WellCounts, cdr1WellCounts, weights);
|
||||
System.out.println("Matrix created");
|
||||
|
||||
@@ -383,11 +361,9 @@ public class Simulator {
|
||||
new SimpleWeightedGraph<>(DefaultWeightedEdge.class);
|
||||
|
||||
SimpleWeightedBipartiteGraphMatrixGenerator graphGenerator = new SimpleWeightedBipartiteGraphMatrixGenerator();
|
||||
List<Integer> cdr3Vertices = new ArrayList<>();
|
||||
cdr3Vertices.addAll(plateVtoCDR3Map.keySet()); //This will work because LinkedHashMap preserves order of entry
|
||||
List<Integer> cdr3Vertices = new ArrayList<>(plateVtoCDR3Map.keySet()); //This will work because LinkedHashMap preserves order of entry
|
||||
graphGenerator.first(cdr3Vertices);
|
||||
List<Integer> cdr1Vertices = new ArrayList<>();
|
||||
cdr1Vertices.addAll(plateVtoCDR1Map.keySet());
|
||||
List<Integer> cdr1Vertices = new ArrayList<>(plateVtoCDR1Map.keySet());
|
||||
graphGenerator.second(cdr1Vertices); //This will work because LinkedHashMap preserves order of entry
|
||||
graphGenerator.weights(weights);
|
||||
graphGenerator.generateGraph(graph);
|
||||
@@ -407,7 +383,7 @@ public class Simulator {
|
||||
//first processing run
|
||||
Map<Integer, Integer> firstMatchCDR3toCDR1Map = new HashMap<>();
|
||||
Iterator<DefaultWeightedEdge> weightIter = graphMatching.iterator();
|
||||
DefaultWeightedEdge e = null;
|
||||
DefaultWeightedEdge e;
|
||||
while(weightIter.hasNext()){
|
||||
e = weightIter.next();
|
||||
// if(graph.getEdgeWeight(e) < lowThreshold || graph.getEdgeWeight(e) > highThreshold) {
|
||||
@@ -531,7 +507,7 @@ public class Simulator {
|
||||
comments.add("Correct matching rate: " + correctRate);
|
||||
NumberFormat nf = NumberFormat.getInstance(Locale.US);
|
||||
Duration time = Duration.between(start, stop);
|
||||
time.plus(previousTime);
|
||||
time = time.plus(previousTime);
|
||||
comments.add("Simulation time: " + nf.format(time.toSeconds()) + " seconds");
|
||||
for(String s: comments){
|
||||
System.out.println(s);
|
||||
@@ -605,38 +581,38 @@ public class Simulator {
|
||||
|
||||
//Counts the well occupancy of the row peptides and column peptides into given maps, and
|
||||
//fills weights in the given 2D array
|
||||
private static void countPeptidesAndFillMatrix(Plate samplePlate,
|
||||
Map<Integer,Integer> allRowPeptides,
|
||||
Map<Integer,Integer> allColumnPeptides,
|
||||
Map<Integer,Integer> rowPeptideToVertexMap,
|
||||
Map<Integer,Integer> columnPeptideToVertexMap,
|
||||
int[] rowPeptideIndices,
|
||||
int[] colPeptideIndices,
|
||||
Map<Integer, Integer> rowPeptideCounts,
|
||||
Map<Integer,Integer> columnPeptideCounts,
|
||||
private static void countSequencesAndFillMatrix(Plate samplePlate,
|
||||
Map<Integer,Integer> allRowSequences,
|
||||
Map<Integer,Integer> allColumnSequences,
|
||||
Map<Integer,Integer> rowSequenceToVertexMap,
|
||||
Map<Integer,Integer> columnSequenceToVertexMap,
|
||||
int[] rowSequenceIndices,
|
||||
int[] colSequenceIndices,
|
||||
Map<Integer, Integer> rowSequenceCounts,
|
||||
Map<Integer,Integer> columnSequenceCounts,
|
||||
double[][] weights){
|
||||
Map<Integer, Integer> wellNRowPeptides = null;
|
||||
Map<Integer, Integer> wellNColumnPeptides = null;
|
||||
int vertexStartValue = rowPeptideToVertexMap.size();
|
||||
Map<Integer, Integer> wellNRowSequences = null;
|
||||
Map<Integer, Integer> wellNColumnSequences = null;
|
||||
int vertexStartValue = rowSequenceToVertexMap.size();
|
||||
int numWells = samplePlate.getSize();
|
||||
for (int n = 0; n < numWells; n++) {
|
||||
wellNRowPeptides = samplePlate.assayWellsPeptideP(n, rowPeptideIndices);
|
||||
for (Integer a : wellNRowPeptides.keySet()) {
|
||||
if(allRowPeptides.containsKey(a)){
|
||||
rowPeptideCounts.merge(a, 1, (oldValue, newValue) -> oldValue + newValue);
|
||||
wellNRowSequences = samplePlate.assayWellsSequenceS(n, rowSequenceIndices);
|
||||
for (Integer a : wellNRowSequences.keySet()) {
|
||||
if(allRowSequences.containsKey(a)){
|
||||
rowSequenceCounts.merge(a, 1, (oldValue, newValue) -> oldValue + newValue);
|
||||
}
|
||||
}
|
||||
wellNColumnPeptides = samplePlate.assayWellsPeptideP(n, colPeptideIndices);
|
||||
for (Integer b : wellNColumnPeptides.keySet()) {
|
||||
if(allColumnPeptides.containsKey(b)){
|
||||
columnPeptideCounts.merge(b, 1, (oldValue, newValue) -> oldValue + newValue);
|
||||
wellNColumnSequences = samplePlate.assayWellsSequenceS(n, colSequenceIndices);
|
||||
for (Integer b : wellNColumnSequences.keySet()) {
|
||||
if(allColumnSequences.containsKey(b)){
|
||||
columnSequenceCounts.merge(b, 1, (oldValue, newValue) -> oldValue + newValue);
|
||||
}
|
||||
}
|
||||
for (Integer i : wellNRowPeptides.keySet()) {
|
||||
if(allRowPeptides.containsKey(i)){
|
||||
for (Integer j : wellNColumnPeptides.keySet()) {
|
||||
if(allColumnPeptides.containsKey(j)){
|
||||
weights[rowPeptideToVertexMap.get(i)][columnPeptideToVertexMap.get(j) - vertexStartValue] += 1.0;
|
||||
for (Integer i : wellNRowSequences.keySet()) {
|
||||
if(allRowSequences.containsKey(i)){
|
||||
for (Integer j : wellNColumnSequences.keySet()) {
|
||||
if(allColumnSequences.containsKey(j)){
|
||||
weights[rowSequenceToVertexMap.get(i)][columnSequenceToVertexMap.get(j) - vertexStartValue] += 1.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -666,19 +642,54 @@ public class Simulator {
|
||||
}
|
||||
}
|
||||
|
||||
private static Map<Integer, Integer> makePeptideToPeptideMap(List<Integer[]> cells, int keyPeptideIndex,
|
||||
int valuePeptideIndex){
|
||||
Map<Integer, Integer> keyPeptideToValuePeptideMap = new HashMap<>();
|
||||
for (Integer[] cell : cells) {
|
||||
keyPeptideToValuePeptideMap.put(cell[keyPeptideIndex], cell[valuePeptideIndex]);
|
||||
//Remove edges for pairs with large occupancy discrepancy
|
||||
private static void filterByRelativeOccupancy(SimpleWeightedGraph<Integer, DefaultWeightedEdge> graph,
|
||||
Map<Integer, Integer> alphaWellCounts,
|
||||
Map<Integer, Integer> betaWellCounts,
|
||||
Map<Integer, Integer> plateVtoAMap,
|
||||
Map<Integer, Integer> plateVtoBMap,
|
||||
Integer maxOccupancyDifference) {
|
||||
for (DefaultWeightedEdge e : graph.edgeSet()) {
|
||||
Integer alphaOcc = alphaWellCounts.get(plateVtoAMap.get(graph.getEdgeSource(e)));
|
||||
Integer betaOcc = betaWellCounts.get(plateVtoBMap.get(graph.getEdgeTarget(e)));
|
||||
//Adjust this to something cleverer later
|
||||
if (Math.abs(alphaOcc - betaOcc) >= maxOccupancyDifference) {
|
||||
graph.setEdgeWeight(e, 0.0);
|
||||
}
|
||||
}
|
||||
return keyPeptideToValuePeptideMap;
|
||||
}
|
||||
|
||||
private static Map<Integer, Integer> makeVertexToPeptideMap(Map<Integer, Integer> peptides, Integer startValue) {
|
||||
//Remove edges for pairs where overlap size is significantly lower than the well occupancy
|
||||
private static void filterByOverlapSize(SimpleWeightedGraph<Integer, DefaultWeightedEdge> graph,
|
||||
Map<Integer, Integer> alphaWellCounts,
|
||||
Map<Integer, Integer> betaWellCounts,
|
||||
Map<Integer, Integer> plateVtoAMap,
|
||||
Map<Integer, Integer> plateVtoBMap,
|
||||
Integer minOverlapPercent) {
|
||||
for (DefaultWeightedEdge e : graph.edgeSet()) {
|
||||
Integer alphaOcc = alphaWellCounts.get(plateVtoAMap.get(graph.getEdgeSource(e)));
|
||||
Integer betaOcc = betaWellCounts.get(plateVtoBMap.get(graph.getEdgeTarget(e)));
|
||||
double weight = graph.getEdgeWeight(e);
|
||||
double min = minOverlapPercent / 100.0;
|
||||
if ((weight / alphaOcc < min) || (weight / betaOcc < min)) {
|
||||
graph.setEdgeWeight(e, 0.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static Map<Integer, Integer> makeSequenceToSequenceMap(List<Integer[]> cells, int keySequenceIndex,
|
||||
int valueSequenceIndex){
|
||||
Map<Integer, Integer> keySequenceToValueSequenceMap = new HashMap<>();
|
||||
for (Integer[] cell : cells) {
|
||||
keySequenceToValueSequenceMap.put(cell[keySequenceIndex], cell[valueSequenceIndex]);
|
||||
}
|
||||
return keySequenceToValueSequenceMap;
|
||||
}
|
||||
|
||||
private static Map<Integer, Integer> makeVertexToSequenceMap(Map<Integer, Integer> sequences, Integer startValue) {
|
||||
Map<Integer, Integer> map = new LinkedHashMap<>(); //LinkedHashMap to preserve order of entry
|
||||
Integer index = startValue;
|
||||
for (Integer k: peptides.keySet()) {
|
||||
for (Integer k: sequences.keySet()) {
|
||||
map.put(index, k);
|
||||
index++;
|
||||
}
|
||||
@@ -693,9 +704,4 @@ public class Simulator {
|
||||
return inverse;
|
||||
}
|
||||
|
||||
private static int bigDecimalComparator(BigDecimal bd, Integer i) {
|
||||
return bd.compareTo(BigDecimal.valueOf(i));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
17
src/main/java/Vertex.java
Normal file
17
src/main/java/Vertex.java
Normal file
@@ -0,0 +1,17 @@
|
||||
/**
 * Immutable pairing of a peptide identifier with its well-occupancy count, used as a
 * graph-vertex payload.
 */
public class Vertex {

    // Peptide identifier carried by this vertex.
    private final Integer peptide;
    // Number of wells the peptide was observed in (occupancy count).
    private final Integer occupancy;

    /**
     * @param peptide   peptide identifier
     * @param occupancy well-occupancy count for the peptide
     */
    public Vertex(Integer peptide, Integer occupancy) {
        this.peptide = peptide;
        this.occupancy = occupancy;
    }

    /** @return the peptide identifier supplied at construction */
    public Integer getPeptide() {
        return peptide;
    }

    /** @return the well-occupancy count supplied at construction */
    public Integer getOccupancy() {
        return occupancy;
    }
}
|
||||
Reference in New Issue
Block a user