57 Commits

Author SHA1 Message Date
4b053e6ec4 Remove artifacts from tracking to stop repo bloat. 2022-02-22 16:14:50 -06:00
44784b7976 Remove artifacts from tracking to stop repo bloat. 2022-02-22 16:10:22 -06:00
7c19896dc9 update readme 2022-02-22 16:09:50 -06:00
aec7e3016f Typos in documentation 2022-02-21 11:19:54 -06:00
5c75c1ac09 Update readme.md 2022-02-21 06:53:30 +00:00
cb1f7adece Change "peptide" references in code to "sequence", adding comments 2022-02-21 00:29:34 -06:00
370de79546 Add performance section to readme 2022-02-21 00:02:49 -06:00
a803336f56 Add performance section to readme 2022-02-21 00:01:20 -06:00
94b54b3416 Add performance section to readme 2022-02-20 23:31:25 -06:00
601e141fd0 Update readme 2022-02-20 22:51:49 -06:00
8f9c6b7d33 Update readme TODO 2022-02-20 20:59:05 -06:00
e5ddc73723 Finish reverting back to wells-as-rows 2022-02-20 20:54:44 -06:00
9b18fac74f Invoke garbage collection 2022-02-20 20:47:12 -06:00
63ef6aa7a0 Revert attempt to switch plate output format. It worked, but introduced a bug in graph filtering I don't want to chase down 2022-02-20 20:45:35 -06:00
7558455f39 Correct errors in output and documentation 2022-02-20 20:13:38 -06:00
410f0ae547 Remove testing code, add garbage collection calls 2022-02-20 20:06:45 -06:00
1bc6a11545 Change plate reader/writer to use columns as wells 2022-02-20 19:58:24 -06:00
2b13e10e95 Change plate reader/writer to use columns as wells 2022-02-20 19:48:09 -06:00
4fd5baeb98 Change plate reader/writer to use columns as wells 2022-02-20 19:41:06 -06:00
b2a4e9a42b Change plate reader/writer to use columns as wells 2022-02-20 19:17:56 -06:00
d1bb49b482 Change plate reader/writer to use columns as wells 2022-02-20 19:12:11 -06:00
9adb7dffb8 Change plate reader/writer to use columns as wells 2022-02-20 19:08:04 -06:00
2023bb9d7e Cleanup file output, add UI verbosity 2022-02-20 18:31:31 -06:00
405fbf17ff improve documentation 2022-02-20 17:11:39 -06:00
24519f4a52 improve documentation 2022-02-20 17:04:25 -06:00
2afd01eeef improve documentation 2022-02-20 15:48:11 -06:00
10d0b711bf improve documentation 2022-02-20 15:38:40 -06:00
8f98baf44e improve documentation 2022-02-20 15:37:39 -06:00
d6c7c40c96 improve documentation 2022-02-20 13:23:15 -06:00
61c14b2ecf improve documentation 2022-02-20 13:20:47 -06:00
22fc4aedfe improve documentation 2022-02-20 13:18:49 -06:00
5d24dc6f70 improve documentation 2022-02-20 13:15:32 -06:00
2c01a0211c move readme 2022-02-20 12:02:27 -06:00
f2b5d9e1b7 Rename and update readme 2022-02-20 11:58:12 -06:00
74c8cafd81 scan for filename 2022-02-20 03:08:31 -06:00
d1c37b5ccd Relocate overlap threshold filters 2022-02-20 03:05:56 -06:00
cb2c5a6024 Add plate well concentrations to output data 2022-02-20 02:29:42 -06:00
284a5b3a40 Add plate well concentrations to output data 2022-02-20 02:23:31 -06:00
52afb1edc2 Add plate well concentrations to output data 2022-02-20 02:17:36 -06:00
9c52bc878a Add plate well concentrations to output data 2022-02-20 02:13:13 -06:00
248fe4d662 Add plate well concentrations to output data 2022-02-20 02:09:22 -06:00
5d0e60708c Add plate well concentrations to output data 2022-02-20 01:53:34 -06:00
c96b7237e9 Add plate well concentrations to output data 2022-02-20 01:40:01 -06:00
0b28259800 Add plate well concentrations to output data 2022-02-20 01:13:22 -06:00
837ef7bfe4 UI cleanup, some code cleanup 2022-02-20 01:05:28 -06:00
0bebbc7602 Add missing filtering code 2022-02-19 22:56:38 -06:00
84f7ddb696 Fix interactive output 2022-02-19 22:49:50 -06:00
c4633da9eb Correct propogation of peptide counts 2022-02-19 22:33:38 -06:00
5b2ed165d0 Clean up interactive text, bugfix 2022-02-19 22:21:09 -06:00
0026d8cdfe Use buffered input/output streams 2022-02-19 22:04:41 -06:00
13fb7168bf Refactor to read/write files of graph and map data 2022-02-19 21:46:01 -06:00
568a6be3c7 Refactoring to allow graphs from file 2022-02-19 17:23:55 -06:00
cfa473c7ce Adding parameters to filter by occupancy difference and percent overlap 2022-02-19 14:06:11 -06:00
6faacd9a82 Adding parameters to filter by occupancy difference and percent overlap 2022-02-19 14:05:26 -06:00
ce88e170c1 Update readme with max memory flag 2022-02-18 17:48:25 -06:00
47e23addfa Do new filtering before matching 2022-02-18 17:42:05 -06:00
b9ee31b64c Do new filtering before matching 2022-02-18 17:28:24 -06:00
18 changed files with 1293 additions and 642 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/out/

View File

@@ -1,11 +1,11 @@
<component name="ArtifactManager"> <component name="ArtifactManager">
<artifact type="jar" name="TCellSim:jar"> <artifact type="jar" build-on-make="true" name="BiGpairSEQ_Sim:jar">
<output-path>$PROJECT_DIR$/out/artifacts/TCellSim_jar</output-path> <output-path>$PROJECT_DIR$/out/artifacts/BiGpairSEQ_Sim_jar</output-path>
<root id="archive" name="TCellSim.jar"> <root id="archive" name="BiGpairSEQ_Sim.jar">
<element id="directory" name="META-INF"> <element id="directory" name="META-INF">
<element id="file-copy" path="$PROJECT_DIR$/src/main/java/META-INF/MANIFEST.MF" /> <element id="file-copy" path="$PROJECT_DIR$/src/main/java/META-INF/MANIFEST.MF" />
</element> </element>
<element id="module-output" name="TCellSim" /> <element id="module-output" name="BigPairSEQ" />
<element id="extracted-dir" path="$MAVEN_REPOSITORY$/org/jgrapht/jgrapht-core/1.5.1/jgrapht-core-1.5.1.jar" path-in-jar="/" /> <element id="extracted-dir" path="$MAVEN_REPOSITORY$/org/jgrapht/jgrapht-core/1.5.1/jgrapht-core-1.5.1.jar" path-in-jar="/" />
<element id="extracted-dir" path="$MAVEN_REPOSITORY$/org/jheaps/jheaps/0.13/jheaps-0.13.jar" path-in-jar="/" /> <element id="extracted-dir" path="$MAVEN_REPOSITORY$/org/jheaps/jheaps/0.13/jheaps-0.13.jar" path-in-jar="/" />
<element id="extracted-dir" path="$MAVEN_REPOSITORY$/commons-cli/commons-cli/1.5.0/commons-cli-1.5.0.jar" path-in-jar="/" /> <element id="extracted-dir" path="$MAVEN_REPOSITORY$/commons-cli/commons-cli/1.5.0/commons-cli-1.5.0.jar" path-in-jar="/" />

2
.idea/compiler.xml generated
View File

@@ -6,7 +6,7 @@
<sourceOutputDir name="target/generated-sources/annotations" /> <sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" /> <sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" /> <outputRelativeToContentRoot value="true" />
<module name="TCellSim" /> <module name="BigPairSEQ" />
</profile> </profile>
</annotationProcessing> </annotationProcessing>
</component> </component>

15
.idea/libraries/jgrapht_io.xml generated Normal file
View File

@@ -0,0 +1,15 @@
<component name="libraryTable">
<library name="jgrapht.io" type="repository">
<properties maven-id="org.jgrapht:jgrapht-io:1.5.1" />
<CLASSES>
<root url="jar://$MAVEN_REPOSITORY$/org/jgrapht/jgrapht-io/1.5.1/jgrapht-io-1.5.1.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/org/jgrapht/jgrapht-core/1.5.1/jgrapht-core-1.5.1.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/org/jheaps/jheaps/0.13/jheaps-0.13.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/org/antlr/antlr4-runtime/4.8-1/antlr4-runtime-4.8-1.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-text/1.8/commons-text-1.8.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-lang3/3.9/commons-lang3-3.9.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>

View File

@@ -1,4 +0,0 @@
Executable .jar file with interactive, command line UI
Usage: java -jar TCellSim.jar
Requires Java11 or higher (Openjdk-17 recommended)

278
readme.md Normal file
View File

@@ -0,0 +1,278 @@
# BiGpairSEQ SIMULATOR
## ABOUT
This program simulates BiGpairSEQ (Bipartite Graph pairSEQ), a graph theory-based adaptation
of the pairSEQ algorithm (Howie, et al. 2015) for pairing T cell receptor sequences.
## THEORY
Unlike pairSEQ, which calculates p-values for every TCR alpha/beta overlap and compares
against a null distribution, BiGpairSEQ does not do any statistical calculations
directly.
BiGpairSEQ creates a [simple bipartite weighted graph](https://en.wikipedia.org/wiki/Bipartite_graph) representing the sample plate.
The distinct TCRA and TCRB sequences form the two sets of vertices. Every TCRA/TCRB pair that share a well
are connected by an edge, with the edge weight set to the number of wells in which both sequences appear.
(Sequences present in *all* wells are filtered out prior to creating the graph, as there is no signal in their occupancy pattern.)
The problem of pairing TCRA/TCRB sequences thus reduces to the "assignment problem" of finding a maximum weight
matching on a bipartite graph--the subset of vertex-disjoint edges whose weights sum to the maximum possible value.
This is a well-studied combinatorial optimization problem, with many known solutions.
The most efficient algorithm known to the author for maximum weight matching of a bipartite graph with strictly integral weights
is from Duan and Su (2012). For a graph with m edges, n vertices per side, and maximum integer edge weight N,
their algorithm runs in **O(m sqrt(n) log(N))** time. As the graph representation of a pairSEQ experiment is
bipartite with integer weights, this algorithm is ideal for BiGpairSEQ.
Unfortunately, it's a fairly new algorithm, and not yet implemented by the graph theory library used in this simulator.
So this program instead uses the Fibonacci heap-based algorithm of Fredman and Tarjan (1987), which has a worst-case
runtime of **O(n (n log(n) + m))**. The algorithm is implemented as described in Mehlhorn and Näher (1999).
The current version of the program uses a pairing heap instead of a Fibonacci heap for its priority queue,
which has lower theoretical efficiency but also lower complexity overhead, and is often equivalently performant
in practice.
## USAGE
### RUNNING THE PROGRAM
BiGpairSEQ_Sim is an executable .jar file. Requires Java 11 or higher. [OpenJDK 17](https://jdk.java.net/17/)
recommended.
Run with the command:
`java -jar BiGpairSEQ_Sim.jar`
Processing sample plates with tens of thousands of sequences may require large amounts
of RAM. It is often desirable to increase the JVM maximum heap allocation with the -Xmx flag.
For example, to run the program with 32 gigabytes of memory, use the command:
`java -Xmx32G -jar BiGpairSEQ_Sim.jar`
Once running, BiGpairSEQ_Sim has an interactive, menu-driven CLI for generating files and simulating TCR pairing. The
main menu looks like this:
```
--------BiGPairSEQ SIMULATOR--------
ALPHA/BETA T CELL RECEPTOR MATCHING
USING WEIGHTED BIPARTITE GRAPHS
------------------------------------
Please select an option:
1) Generate a population of distinct cells
2) Generate a sample plate of T cells
3) Generate CDR3 alpha/beta occupancy data and overlap graph
4) Simulate bipartite graph CDR3 alpha/beta matching (BiGpairSEQ)
9) About/Acknowledgments
0) Exit
```
### OUTPUT
To run the simulation, the program reads and writes 4 kinds of files:
* Cell Sample files in CSV format
* Sample Plate files in CSV format
* Graph and Data files in binary object serialization format
* Matching Results files in CSV format
When entering filenames, it is not necessary to include the file extension (.csv or .ser). When reading or
writing files, the program will automatically add the correct extension to any filename without one.
#### Cell Sample Files
Cell Sample files consist of any number of distinct "T cells." Every cell contains
four sequences: Alpha CDR3, Beta CDR3, Alpha CDR1, Beta CDR1. The sequences are represented by
random integers. CDR3 Alpha and Beta sequences are all unique within a given Cell Sample file. CDR1 Alpha and Beta sequences
are not necessarily unique; the relative diversity can be set when making the file.
(Note: though cells still have CDR1 sequences, matching of CDR1s is currently awaiting re-implementation.)
Options when making a Cell Sample file:
* Number of T cells to generate
* Factor by which CDR3s are more diverse than CDR1s
Files are in CSV format. Rows are distinct T cells, columns are sequences within the cells.
Comments are preceded by `#`
Structure:
---
# Sample contains 1 unique CDR1 for every 4 unique CDR3s.
| Alpha CDR3 | Beta CDR3 | Alpha CDR1 | Beta CDR1 |
|---|---|---|---|
|unique number|unique number|number|number|
---
#### Sample Plate Files
Sample Plate files consist of any number of "wells" containing any number of T cells (as
described above). The wells are filled randomly from a Cell Sample file, according to a selected
frequency distribution. Additionally, every individual sequence within each cell may, with some
given dropout probability, be omitted from the file; this simulates the effect of amplification errors
prior to sequencing. Plates can also be partitioned into any number of sections, each of which can have a
different concentration of T cells per well.
Options when making a Sample Plate file:
* Cell Sample file to use
* Statistical distribution to apply to Cell Sample file
* Poisson
* Gaussian
* Standard deviation size
* Exponential
* Lambda value
* (Based on the slope of the graph in Figure 4C of the pairSEQ paper, the distribution of the original experiment was exponential with a lambda of approximately 0.6. (Howie, et al. 2015))
* Total number of wells on the plate
* Number of sections on plate
* Number of T cells per well
* per section, if more than one section
* Dropout rate
Files are in CSV format. There are no header labels. Every row represents a well.
Every column represents an individual cell, containing four sequences, depicted as an array string:
`[CDR3A, CDR3B, CDR1A, CDR1B]`. So a representative cell might look like this:
`[525902, 791533, -1, 866282]`
Notice that the CDR1 Alpha is missing in the cell above--sequence dropout from simulated amplification error.
Dropout sequences are replaced with the value `-1`. Comments are preceded by `#`
Structure:
---
```
# Cell source file name:
# Each row represents one well on the plate
# Plate size:
# Concentrations:
# Lambda -or- StdDev:
```
| Well 1, cell 1 | Well 1, cell 2 | Well 1, cell 3| ... |
|---|---|---|---|
| **Well 2, cell 1** | **Well 2, cell 2** | **Well 2, cell 3**| **...** |
| **Well 3, cell 1** | **Well 3, cell 2** | **Well 3, cell 3**| **...** |
| **...** | **...** | **...** | **...** |
---
#### Graph and Data Files
Graph and Data files are serialized binaries of a Java object containing the weighted bipartite graph representation of a
Sample Plate, along with the necessary metadata for matching and results output. Making them requires a Cell Sample file
(to construct a list of correct sequence pairs for checking the accuracy of BiGpairSEQ simulations) and a
Sample Plate file (to construct the associated occupancy graph). These files can be several gigabytes in size.
Writing them to a file lets us generate a graph and its metadata once, then use it for multiple different BiGpairSEQ simulations.
Options for creating a Graph and Data file:
* The Cell Sample file to use
* The Sample Plate file to use. (This must have been generated from the selected Cell Sample file.)
These files do not have a human-readable structure, and are not portable to other programs. (Export of graphs in a
portable data format may be implemented in the future. The tricky part is encoding the necessary metadata.)
---
#### Matching Results Files
Matching results files consist of the results of a BiGpairSEQ matching simulation.
Files are in CSV format. Rows are sequence pairings with extra relevant data. Columns are pairing-specific details.
Metadata about the matching simulation is included as comments. Comments are preceded by `#`.
Options when running a BiGpairSEQ simulation of CDR3 alpha/beta matching:
* The minimum number of alpha/beta overlap wells to attempt to match
* (must be >= 1)
* The maximum number of alpha/beta overlap wells to attempt to match
* (must be <= the number of wells on the plate - 1)
* The maximum difference in alpha/beta occupancy to attempt to match
* (Optional. To skip using this filter, enter a value >= the number of wells on the plate)
* The minimum overlap percentage--the percentage of a sequence's occupied wells shared by another sequence--to attempt to match. Given as value in range 0 - 100.
* (Optional. To skip using this filter, enter 0)
Example output:
---
```
# Source Sample Plate file: 4MilCellsPlate.csv
# Source Graph and Data file: 4MilCellsPlateGraph.ser
# T cell counts in sample plate wells: 30000
# Total alphas found: 11813
# Total betas found: 11808
# High overlap threshold: 94
# Low overlap threshold: 3
# Minimum overlap percent: 0
# Maximum occupancy difference: 96
# Pairing attempt rate: 0.438
# Correct pairings: 5151
# Incorrect pairings: 18
# Pairing error rate: 0.00348
# Simulation time: 862 seconds
```
| Alpha | Alpha well count | Beta | Beta well count | Overlap count | Matched Correctly? | P-value |
|---|---|---|---|---|---|---|
|5242972|17|1571520|18|17|true|1.41E-18|
|5161027|18|2072219|18|18|true|7.31E-20|
|4145198|33|1064455|30|29|true|2.65E-21|
|7700582|18|112748|18|18|true|7.31E-20|
|...|...|...|...|...|...|...|
---
**NOTE: The p-values in the output are not used for matching**—they aren't part of the BiGpairSEQ algorithm at all.
P-values are calculated *after* BiGpairSEQ matching is completed, for purposes of comparison only,
using the (2021 corrected) formula from the original pairSEQ paper. (Howie, et al. 2015)
### PERFORMANCE
Performance details of the example excerpted above:
On a home computer with a Ryzen 5600X CPU, 64GB of 3200MHz DDR4 RAM (half of which was allocated to the Java Virtual Machine), and a PCIe 3.0 SSD, running Linux Mint 20.3 Edge (5.13 kernel),
the author ran a BiGpairSEQ simulation of a 96-well sample plate with 30,000 T cells/well comprising ~11,800 alphas and betas,
taken from a sample of 4,000,000 distinct cells with an exponential frequency distribution.
With min/max occupancy threshold of 3 and 94 wells for matching, and no other pre-filtering, BiGpairSEQ identified 5,151
correct pairings and 18 incorrect pairings, for an accuracy of 99.652%.
The simulation time was 14'22". If intermediate results were held in memory, this would be equivalent to the total elapsed time.
Since this implementation of BiGpairSEQ writes intermediate results to disk (to improve the efficiency of *repeated* simulations
with different filtering options), the actual elapsed time was greater. File I/O time was not measured, but took
slightly less time than the simulation itself. Real elapsed time from start to finish was under 30 minutes.
## TODO
* ~~Try invoking GC at end of workloads to reduce paging to disk~~ DONE
* ~~Hold graph data in memory until another graph is read-in?~~ ABANDONED
* *No, this won't work, because BiGpairSEQ simulations alter the underlying graph based on filtering constraints. Changes would cascade with multiple experiments.*
* See if there's a reasonable way to reformat Sample Plate files so that wells are columns instead of rows.
* ~~Problem is variable number of cells in a well~~
* ~~Apache Commons CSV library writes entries a row at a time~~
* _Got this working, but at the cost of a profoundly strange bug in graph occupancy filtering. Have reverted the repo until I can figure out what caused that. Given how easily the tidyverse transposes CSV matrices in R, might not even be worth fixing._
* Re-implement command line arguments, to enable scripting and statistical simulation studies
* Implement sample plates with random numbers of T cells per well.
* Possible BiGpairSEQ advantage over pairSEQ: BiGpairSEQ is resilient to variations in well population sizes on a sample plate; pairSEQ is not.
* preliminary data suggests that BiGpairSEQ behaves roughly as though the whole plate had whatever the *average* well concentration is, but that's still speculative.
* Enable GraphML output in addition to serialized object binaries, for data portability
* Custom vertex type with attribute for sequence occupancy?
* Re-implement CDR1 matching method
* Implement Duan and Su's maximum weight matching algorithm
* Add controllable algorithm-type parameter?
* Test whether pairing heap (currently used) or Fibonacci heap is more efficient for priority queue in current matching algorithm
* in theory Fibonacci heap should be more efficient, but complexity overhead may eliminate theoretical advantage
* Add controllable heap-type parameter?
## CITATIONS
* Howie, B., Sherwood, A. M., et al. ["High-throughput pairing of T cell receptor alpha and beta sequences."](https://pubmed.ncbi.nlm.nih.gov/26290413/) Sci. Transl. Med. 7, 301ra131 (2015)
* Duan, R., Su H. ["A Scaling Algorithm for Maximum Weight Matching in Bipartite Graphs."](https://web.eecs.umich.edu/~pettie/matching/Duan-Su-scaling-bipartite-matching.pdf) Proceedings of the Twenty-Third Annual ACM-SIAM Symposium on Discrete Algorithms, p. 1413-1424. (2012)
* Mehlhorn, K., Näher, St. [The LEDA Platform of Combinatorial and Geometric Computing.](https://people.mpi-inf.mpg.de/~mehlhorn/LEDAbook.html) Cambridge University Press. Chapter 7, Graph Algorithms; p. 132-162 (1999)
* Fredman, M., Tarjan, R. ["Fibonacci heaps and their uses in improved network optimization algorithms."](https://www.cl.cam.ac.uk/teaching/1011/AlgorithII/1987-FredmanTar-fibonacci.pdf) J. ACM, 34(3):596-615 (1987)
## EXTERNAL LIBRARIES USED
* [JGraphT](https://jgrapht.org) -- Graph theory data structures and algorithms
* [JHeaps](https://www.jheaps.org) -- For pairing heap priority queue used in maximum weight matching algorithm
* [Apache Commons CSV](https://commons.apache.org/proper/commons-csv/) -- For CSV file output
* [Apache Commons CLI](https://commons.apache.org/proper/commons-cli/) -- To enable command line arguments for scripting. (**Awaiting re-implementation**.)
## ACKNOWLEDGEMENTS
BiGpairSEQ was conceived in collaboration with Dr. Alice MacQueen, who brought the original
pairSEQ paper to the author's attention and explained all the biology terms he didn't know.
## AUTHOR
BiGpairSEQ algorithm and simulation by Eugene Fischer, 2021. UI improvements and documentation, 2022.

View File

@@ -8,6 +8,9 @@ public abstract class Equations {
return (int) ((Math.random() * (max - min)) + min); return (int) ((Math.random() * (max - min)) + min);
} }
//pValue calculation as described in original pairSEQ paper.
//Included for comparison with original results.
//Not used by BiGpairSEQ for matching.
public static double pValue(Integer w, Integer w_a, Integer w_b, double w_ab_d) { public static double pValue(Integer w, Integer w_a, Integer w_b, double w_ab_d) {
int w_ab = (int) w_ab_d; int w_ab = (int) w_ab_d;
double pv = 0.0; double pv = 0.0;
@@ -18,6 +21,9 @@ public abstract class Equations {
return pv; return pv;
} }
//Implementation of the (corrected) probability equation from pairSEQ paper.
//Included for comparison with original results.
//Not used by BiGpairSEQ for matching.
private static double probPairedByChance(Integer w, Integer w_a, Integer w_b, Integer w_ab){ private static double probPairedByChance(Integer w, Integer w_a, Integer w_b, Integer w_ab){
BigInteger numer1 = choose(w, w_ab); BigInteger numer1 = choose(w, w_ab);
BigInteger numer2 = choose(w - w_ab, w_a - w_ab); BigInteger numer2 = choose(w - w_ab, w_a - w_ab);
@@ -30,10 +36,9 @@ public abstract class Equations {
return prob.doubleValue(); return prob.doubleValue();
} }
/*
* This works because nC(k+1) = nCk * (n-k)/(k+1) //This works because nC(k+1) = nCk * (n-k)/(k+1)
* Since nC0 = 1, can start there and generate all the rest. //Since nC0 = 1, can start there and generate all the rest.
*/
public static BigInteger choose(final int N, final int K) { public static BigInteger choose(final int N, final int K) {
BigInteger nCk = BigInteger.ONE; BigInteger nCk = BigInteger.ONE;
for (int k = 0; k < K; k++) { for (int k = 0; k < K; k++) {

View File

@@ -0,0 +1,29 @@
import java.io.*;
/**
 * Reads a serialized {@link GraphWithMapData} object back from disk.
 * Appends a ".ser" extension to the filename when one is missing, mirroring
 * the naming convention used by GraphDataObjectWriter.
 */
public class GraphDataObjectReader {
    //Deserialized payload; stays null if the file was missing or the class could not be resolved.
    private GraphWithMapData data;
    private String filename;

    /**
     * Opens and deserializes the given file immediately on construction.
     *
     * @param filename path to a serialized graph/data file, with or without the ".ser" extension
     * @throws IOException on an I/O failure other than file-not-found (which is
     *                     caught and logged, leaving {@code data} null)
     */
    public GraphDataObjectReader(String filename) throws IOException {
        //String.endsWith is the idiomatic (and cheaper) extension check;
        //the old regex matches(".*\\.ser") behaved identically here.
        if (!filename.endsWith(".ser")) {
            filename = filename + ".ser";
        }
        this.filename = filename;
        try (//don't need to close these because of try-with-resources
             BufferedInputStream fileIn = new BufferedInputStream(new FileInputStream(filename));
             ObjectInputStream in = new ObjectInputStream(fileIn)) {
            data = (GraphWithMapData) in.readObject();
        } catch (FileNotFoundException | ClassNotFoundException ex) {
            //best-effort read: callers detect failure via getData() == null
            ex.printStackTrace();
        }
    }

    /** @return the deserialized graph/data bundle, or null if reading failed */
    public GraphWithMapData getData() {
        return data;
    }

    /** @return the filename actually read, including the ".ser" extension */
    public String getFilename() {
        return filename;
    }
}

View File

@@ -0,0 +1,28 @@
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
/**
 * Writes a {@link GraphWithMapData} object to disk via Java object serialization.
 * Appends a ".ser" extension to the filename when one is missing, so readers and
 * writers agree on file naming.
 */
public class GraphDataObjectWriter {
    private final GraphWithMapData data;
    private final String filename;

    /**
     * Stores the target filename and payload; nothing is written until
     * {@link #writeDataToFile()} is called.
     *
     * @param filename target path, with or without the ".ser" extension
     * @param data     the graph/data bundle to serialize
     */
    public GraphDataObjectWriter(String filename, GraphWithMapData data) {
        //String.endsWith is the idiomatic (and cheaper) extension check;
        //the old regex matches(".*\\.ser") behaved identically here.
        if (!filename.endsWith(".ser")) {
            filename = filename + ".ser";
        }
        this.filename = filename;
        this.data = data;
    }

    /**
     * Serializes the payload to the target file. Failures are logged rather
     * than thrown; writing is best-effort by design.
     */
    public void writeDataToFile() {
        try (BufferedOutputStream bufferedOut = new BufferedOutputStream(new FileOutputStream(filename));
             ObjectOutputStream out = new ObjectOutputStream(bufferedOut)) {
            out.writeObject(data);
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}

View File

@@ -0,0 +1,35 @@
import org.jgrapht.graph.SimpleWeightedGraph;
import org.jgrapht.nio.graphml.GraphMLImporter;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
/**
 * Imports GraphML data into a caller-supplied graph using JGraphT's importer.
 * Appends a ".graphml" extension to the filename when one is missing.
 */
public class GraphMLFileReader {
    private final String filename;
    //raw type preserved from the existing API; the importer fills this graph in place
    private final SimpleWeightedGraph graph;

    /**
     * Reads the GraphML file into {@code graph} immediately on construction.
     * A missing file is reported to stdout/stderr rather than thrown.
     *
     * @param filename path to a GraphML file, with or without the ".graphml" extension
     * @param graph    the (typically empty) graph to populate
     */
    public GraphMLFileReader(String filename, SimpleWeightedGraph graph) {
        //String.endsWith is the idiomatic (and cheaper) extension check;
        //the old regex matches(".*\\.graphml") behaved identically here.
        if (!filename.endsWith(".graphml")) {
            filename = filename + ".graphml";
        }
        this.filename = filename;
        this.graph = graph;
        try (//don't need to close reader bc of try-with-resources auto-closing
             BufferedReader reader = Files.newBufferedReader(Path.of(filename))) {
            GraphMLImporter<SimpleWeightedGraph, BufferedReader> importer = new GraphMLImporter<>();
            importer.importGraph(graph, reader);
        } catch (IOException ex) {
            System.out.println("Graph file " + filename + " not found.");
            System.err.println(ex);
        }
    }

    /** @return the populated graph (same instance the caller passed in) */
    public SimpleWeightedGraph getGraph() { return graph; }
}

View File

@@ -0,0 +1,35 @@
import org.jgrapht.graph.SimpleWeightedGraph;
import org.jgrapht.nio.dot.DOTExporter;
import org.jgrapht.nio.graphml.GraphMLExporter;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
/**
 * Exports a graph to a new GraphML file using JGraphT's exporter.
 * Appends a ".graphml" extension to the filename when one is missing.
 * Refuses to overwrite: CREATE_NEW fails if the file already exists.
 */
public class GraphMLFileWriter {
    //package-private visibility preserved from the existing API
    final String filename;
    final SimpleWeightedGraph graph;

    /**
     * Stores the target filename and graph; nothing is written until
     * {@link #writeGraphToFile()} is called.
     *
     * @param filename target path, with or without the ".graphml" extension
     * @param graph    the graph to export
     */
    public GraphMLFileWriter(String filename, SimpleWeightedGraph graph) {
        //String.endsWith is the idiomatic (and cheaper) extension check;
        //the old regex matches(".*\\.graphml") behaved identically here.
        if (!filename.endsWith(".graphml")) {
            filename = filename + ".graphml";
        }
        this.filename = filename;
        this.graph = graph;
    }

    /**
     * Writes the graph as GraphML. Fails (with a console message) if the
     * target file already exists, because of StandardOpenOption.CREATE_NEW.
     */
    public void writeGraphToFile() {
        try (BufferedWriter writer = Files.newBufferedWriter(Path.of(filename), StandardOpenOption.CREATE_NEW)) {
            GraphMLExporter<SimpleWeightedGraph, BufferedWriter> exporter = new GraphMLExporter<>();
            exporter.exportGraph(graph, writer);
        } catch (IOException ex) {
            System.out.println("Could not make new file named "+filename);
            System.err.println(ex);
        }
    }
}

View File

@@ -0,0 +1,106 @@
import org.jgrapht.graph.SimpleWeightedGraph;
import java.time.Duration;
import java.util.Map;
//Can't just write the graph, because I need the occupancy data too.
//Makes most sense to serialize object and write that to a file.
//Which means there's no reason to split map data and graph data up.
/**
 * Serializable bundle of a sample plate's bipartite occupancy graph plus all
 * the metadata needed to run matching and report results (vertex/sequence
 * translation maps, well counts, plate geometry, and build time).
 * Immutable except for {@code sourceFilename}, which is set after construction.
 */
public class GraphWithMapData implements java.io.Serializable {
    //NOTE(review): no explicit serialVersionUID. Adding one now would break
    //deserialization of previously written .ser files, so the JVM-computed
    //default is deliberately kept. New serializable classes should declare one.
    private String sourceFilename; //set later via setSourceFilename()
    //raw type preserved from the existing API; vertex/edge types are fixed by the builder
    private final SimpleWeightedGraph graph;
    private final Integer numWells;
    //distinct per-well T cell concentrations; getter exposes the array directly, so callers must not mutate it
    private final Integer[] wellConcentrations;
    private final Integer alphaCount;
    private final Integer betaCount;
    private final Map<Integer, Integer> distCellsMapAlphaKey;
    private final Map<Integer, Integer> plateVtoAMap; //graph vertex id -> alpha sequence
    private final Map<Integer, Integer> plateVtoBMap; //graph vertex id -> beta sequence
    private final Map<Integer, Integer> plateAtoVMap; //alpha sequence -> graph vertex id
    private final Map<Integer, Integer> plateBtoVMap; //beta sequence -> graph vertex id
    private final Map<Integer, Integer> alphaWellCounts; //alpha sequence -> number of wells it occupies
    private final Map<Integer, Integer> betaWellCounts;  //beta sequence -> number of wells it occupies
    private final Duration time; //how long graph construction took

    /**
     * Bundles a completed occupancy graph with its lookup maps and metadata.
     * All references are stored as-is (no defensive copies), matching the
     * original behavior; callers should not mutate the arguments afterward.
     */
    public GraphWithMapData(SimpleWeightedGraph graph, Integer numWells, Integer[] wellConcentrations,
                            Integer alphaCount, Integer betaCount,
                            Map<Integer, Integer> distCellsMapAlphaKey, Map<Integer, Integer> plateVtoAMap,
                            Map<Integer,Integer> plateVtoBMap, Map<Integer, Integer> plateAtoVMap,
                            Map<Integer, Integer> plateBtoVMap, Map<Integer, Integer> alphaWellCounts,
                            Map<Integer, Integer> betaWellCounts, Duration time) {
        this.graph = graph;
        this.numWells = numWells;
        this.wellConcentrations = wellConcentrations;
        this.alphaCount = alphaCount;
        this.betaCount = betaCount;
        this.distCellsMapAlphaKey = distCellsMapAlphaKey;
        this.plateVtoAMap = plateVtoAMap;
        this.plateVtoBMap = plateVtoBMap;
        this.plateAtoVMap = plateAtoVMap;
        this.plateBtoVMap = plateBtoVMap;
        this.alphaWellCounts = alphaWellCounts;
        this.betaWellCounts = betaWellCounts;
        this.time = time;
    }

    public SimpleWeightedGraph getGraph() {
        return graph;
    }

    public Integer getNumWells() {
        return numWells;
    }

    /** @return the internal array (not a copy) -- do not modify */
    public Integer[] getWellConcentrations() {
        return wellConcentrations;
    }

    public Integer getAlphaCount() {
        return alphaCount;
    }

    public Integer getBetaCount() {
        return betaCount;
    }

    public Map<Integer, Integer> getDistCellsMapAlphaKey() {
        return distCellsMapAlphaKey;
    }

    public Map<Integer, Integer> getPlateVtoAMap() {
        return plateVtoAMap;
    }

    public Map<Integer, Integer> getPlateVtoBMap() {
        return plateVtoBMap;
    }

    public Map<Integer, Integer> getPlateAtoVMap() {
        return plateAtoVMap;
    }

    public Map<Integer, Integer> getPlateBtoVMap() {
        return plateBtoVMap;
    }

    public Map<Integer, Integer> getAlphaWellCounts() {
        return alphaWellCounts;
    }

    public Map<Integer, Integer> getBetaWellCounts() {
        return betaWellCounts;
    }

    public Duration getTime() {
        return time;
    }

    /** Records which serialized file this bundle was read from/written to. */
    public void setSourceFilename(String filename) {
        this.sourceFilename = filename;
    }

    public String getSourceFilename() {
        return sourceFilename;
    }
}

View File

@@ -17,18 +17,28 @@ public class Plate {
boolean exponential = false; boolean exponential = false;
public Plate (int size, double error, Integer[] concentrations) { public Plate(int size, double error, Integer[] concentrations) {
this.size = size; this.size = size;
this.error = error; this.error = error;
this.concentrations = concentrations; this.concentrations = concentrations;
//this.stdDev = stdDev;
wells = new ArrayList<>(); wells = new ArrayList<>();
} }
public Plate(String sourceFileName, List<List<Integer[]>> wells){ public Plate(String sourceFileName, List<List<Integer[]>> wells) {
this.sourceFile = sourceFileName; this.sourceFile = sourceFileName;
this.wells = wells; this.wells = wells;
this.size = wells.size(); this.size = wells.size();
List<Integer> concentrations = new ArrayList<>();
for (List<Integer[]> w: wells) {
if(!concentrations.contains(w.size())){
concentrations.add(w.size());
}
}
this.concentrations = new Integer[concentrations.size()];
for (int i = 0; i < this.concentrations.length; i++) {
this.concentrations[i] = concentrations.get(i);
}
} }
public void fillWellsExponential(String sourceFileName, List<Integer[]> cells, double lambda){ public void fillWellsExponential(String sourceFileName, List<Integer[]> cells, double lambda){
@@ -45,6 +55,7 @@ public class Plate {
List<Integer[]> well = new ArrayList<>(); List<Integer[]> well = new ArrayList<>();
for (int j = 0; j < concentrations[section]; j++) { for (int j = 0; j < concentrations[section]; j++) {
do { do {
//inverse transform sampling: for random number u in [0,1), x = log(1-u) / (-lambda)
m = (Math.log10((1 - rand.nextDouble()))/(-lambda)) * Math.sqrt(cells.size()); m = (Math.log10((1 - rand.nextDouble()))/(-lambda)) * Math.sqrt(cells.size());
} while (m >= cells.size() || m < 0); } while (m >= cells.size() || m < 0);
n = (int) Math.floor(m); n = (int) Math.floor(m);
@@ -57,7 +68,7 @@ public class Plate {
} }
Integer[] cellToAdd = cells.get(n).clone(); Integer[] cellToAdd = cells.get(n).clone();
for(int k = 0; k < cellToAdd.length; k++){ for(int k = 0; k < cellToAdd.length; k++){
if(Math.abs(rand.nextDouble()) < error){//error applied to each peptide if(Math.abs(rand.nextDouble()) < error){//error applied to each sequence
cellToAdd[k] = -1; cellToAdd[k] = -1;
} }
} }
@@ -87,7 +98,7 @@ public class Plate {
n = (int) Math.floor(m); n = (int) Math.floor(m);
Integer[] cellToAdd = cells.get(n).clone(); Integer[] cellToAdd = cells.get(n).clone();
for(int k = 0; k < cellToAdd.length; k++){ for(int k = 0; k < cellToAdd.length; k++){
if(Math.abs(rand.nextDouble()) < error){//error applied to each peptide if(Math.abs(rand.nextDouble()) < error){//error applied to each sequence
cellToAdd[k] = -1; cellToAdd[k] = -1;
} }
} }
@@ -123,30 +134,30 @@ public class Plate {
return wells; return wells;
} }
//returns a map of the counts of the peptide at cell index pIndex, in all wells //returns a map of the counts of the sequence at cell index sIndex, in all wells
public Map<Integer, Integer> assayWellsPeptideP(int... pIndices){ public Map<Integer, Integer> assayWellsSequenceS(int... sIndices){
return this.assayWellsPeptideP(0, size, pIndices); return this.assayWellsSequenceS(0, size, sIndices);
} }
//returns a map of the counts of the peptide at cell index pIndex, in a specific well //returns a map of the counts of the sequence at cell index sIndex, in a specific well
public Map<Integer, Integer> assayWellsPeptideP(int n, int... pIndices) { return this.assayWellsPeptideP(n, n+1, pIndices);} public Map<Integer, Integer> assayWellsSequenceS(int n, int... sIndices) { return this.assayWellsSequenceS(n, n+1, sIndices);}
//returns a map of the counts of the peptide at cell index pIndex, in a range of wells //returns a map of the counts of the sequence at cell index sIndex, in a range of wells
public Map<Integer, Integer> assayWellsPeptideP(int start, int end, int... pIndices) { public Map<Integer, Integer> assayWellsSequenceS(int start, int end, int... sIndices) {
Map<Integer,Integer> assay = new HashMap<>(); Map<Integer,Integer> assay = new HashMap<>();
for(int pIndex: pIndices){ for(int pIndex: sIndices){
for(int i = start; i < end; i++){ for(int i = start; i < end; i++){
countPeptides(assay, wells.get(i), pIndex); countSequences(assay, wells.get(i), pIndex);
} }
} }
return assay; return assay;
} }
//For the peptides at cell indices pIndices, counts number of unique peptides in the given well into the given map //For the sequences at cell indices sIndices, counts number of unique sequences in the given well into the given map
private void countPeptides(Map<Integer, Integer> wellMap, List<Integer[]> well, int... pIndices) { private void countSequences(Map<Integer, Integer> wellMap, List<Integer[]> well, int... sIndices) {
for(Integer[] cell : well) { for(Integer[] cell : well) {
for(int pIndex: pIndices){ for(int sIndex: sIndices){
if(cell[pIndex] != -1){ if(cell[sIndex] != -1){
wellMap.merge(cell[pIndex], 1, (oldValue, newValue) -> oldValue + newValue); wellMap.merge(cell[sIndex], 1, (oldValue, newValue) -> oldValue + newValue);
} }
} }
} }

View File

@@ -73,6 +73,7 @@ public class PlateFileWriter {
} }
rows.add(tmp); rows.add(tmp);
} }
//build string of well concentrations
StringBuilder concen = new StringBuilder(); StringBuilder concen = new StringBuilder();
for(Integer i: concentrations){ for(Integer i: concentrations){
concen.append(i.toString()); concen.append(i.toString());
@@ -80,7 +81,9 @@ public class PlateFileWriter {
} }
String concenString = concen.toString(); String concenString = concen.toString();
CSVFormat plateFileFormat = CSVFormat.Builder.create().setCommentMarker('#').build(); CSVFormat plateFileFormat = CSVFormat.Builder.create()
.setCommentMarker('#')
.build();
try(BufferedWriter writer = Files.newBufferedWriter(Path.of(filename), StandardOpenOption.CREATE_NEW); try(BufferedWriter writer = Files.newBufferedWriter(Path.of(filename), StandardOpenOption.CREATE_NEW);
CSVPrinter printer = new CSVPrinter(writer, plateFileFormat); CSVPrinter printer = new CSVPrinter(writer, plateFileFormat);

View File

@@ -1,84 +1,27 @@
//import org.jgrapht.Graph;
import org.jgrapht.alg.interfaces.MatchingAlgorithm; import org.jgrapht.alg.interfaces.MatchingAlgorithm;
import org.jgrapht.alg.matching.MaximumWeightBipartiteMatching; import org.jgrapht.alg.matching.MaximumWeightBipartiteMatching;
import org.jgrapht.generate.SimpleWeightedBipartiteGraphMatrixGenerator; import org.jgrapht.generate.SimpleWeightedBipartiteGraphMatrixGenerator;
import org.jgrapht.graph.DefaultWeightedEdge; import org.jgrapht.graph.DefaultWeightedEdge;
import org.jgrapht.graph.SimpleWeightedGraph; import org.jgrapht.graph.SimpleWeightedGraph;
import org.jheaps.AddressableHeap;
import org.jheaps.tree.PairingHeap; import org.jheaps.tree.PairingHeap;
import java.math.BigDecimal; import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.MathContext; import java.math.MathContext;
import java.text.NumberFormat; import java.text.NumberFormat;
import java.time.Instant; import java.time.Instant;
import java.time.Duration; import java.time.Duration;
import java.util.*; import java.util.*;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.IntStream; import java.util.stream.IntStream;
//NOTE: "sequence" in method and variable names refers to a peptide sequence from a simulated T cell
public class Simulator { public class Simulator {
private static int cdr3AlphaIndex = 0; private static final int cdr3AlphaIndex = 0;
private static int cdr3BetaIndex = 1; private static final int cdr3BetaIndex = 1;
private static int cdr1AlphaIndex = 2; private static final int cdr1AlphaIndex = 2;
private static int cdr1BetaIndex = 3; private static final int cdr1BetaIndex = 3;
//Tested generating the cell sample file itself with a set distribution, but it wasn't working well
//realized I'd have to change matching algos as well, so abandoned it.
// public static CellSample generateCellSampleExp(Integer numDistinctCells, Integer cdr1Freq){
// //was told I=in real T cells, CDR1s have about one third the diversity of CDR3s
// //this could be wrong, though. could be less diversity than that.
// //previous sim was only CDR3s
// //Integer numDistinctCells = 1000000;
// //int cdr1Freq = 3;
//
// List<Integer> numbersCDR3 = new ArrayList<>();
// List<Integer> numbersCDR1 = new ArrayList<>();
// Integer numDistCDR3s = 2 * numDistinctCells + 1;
// IntStream.range(1, numDistCDR3s + 1).forEach(i -> numbersCDR3.add(i));
// IntStream.range(numDistCDR3s + 1, numDistCDR3s + 1 + (numDistCDR3s / cdr1Freq) + 1).forEach(i -> numbersCDR1.add(i));
// Collections.shuffle(numbersCDR3);
// Collections.shuffle(numbersCDR1);
//
// //Each cell represented by 4 values
// //two CDR3s, and two CDR1s. First two values are CDR3s, second two are CDR1s
// List<Integer[]> distinctCells = new ArrayList<>();
// for(int i = 0; i < numbersCDR3.size() - 1; i = i + 2){
// Integer tmpCDR3a = numbersCDR3.get(i);
// Integer tmpCDR3b = numbersCDR3.get(i+1);
// Integer tmpCDR1a = numbersCDR1.get(i % numbersCDR1.size());
// Integer tmpCDR1b = numbersCDR1.get((i+1) % numbersCDR1.size());
// Integer[] tmp = {tmpCDR3a, tmpCDR3b, tmpCDR1a, tmpCDR1b};
// distinctCells.add(tmp);
// }
// List<Integer[]>sampleCells = new ArrayList<>();
// int count;
// int lastFactor = 0;
// int factor = 10;
// int index = 0;
// while(numDistinctCells / factor >= 1){
// int i = index;
// while(i < index + factor - lastFactor){
// count = 0;
// while(count < (numDistinctCells/(factor * 10))){
// sampleCells.add(distinctCells.get(i));
// count++;
// }
// i++;
// }
// index = i;
// lastFactor = factor;
// factor *=10;
// }
// System.out.println("Total cells in sample: " + sampleCells.size());
// System.out.println("Distinct cells in sample: " + index);
// return new CellSample(sampleCells, cdr1Freq);
// }
public static CellSample generateCellSample(Integer numDistinctCells, Integer cdr1Freq) { public static CellSample generateCellSample(Integer numDistinctCells, Integer cdr1Freq) {
//In real T cells, CDR1s have about one third the diversity of CDR3s //In real T cells, CDR1s have about one third the diversity of CDR3s
//previous sim was only CDR3s
List<Integer> numbersCDR3 = new ArrayList<>(); List<Integer> numbersCDR3 = new ArrayList<>();
List<Integer> numbersCDR1 = new ArrayList<>(); List<Integer> numbersCDR1 = new ArrayList<>();
Integer numDistCDR3s = 2 * numDistinctCells + 1; Integer numDistCDR3s = 2 * numDistinctCells + 1;
@@ -88,7 +31,7 @@ public class Simulator {
Collections.shuffle(numbersCDR1); Collections.shuffle(numbersCDR1);
//Each cell represented by 4 values //Each cell represented by 4 values
//two CDR3s, and two CDR1s. First two values are CDR3s, second two are CDR1s //two CDR3s, and two CDR1s. First two values are CDR3s (alpha, beta), second two are CDR1s (alpha, beta)
List<Integer[]> distinctCells = new ArrayList<>(); List<Integer[]> distinctCells = new ArrayList<>();
for(int i = 0; i < numbersCDR3.size() - 1; i = i + 2){ for(int i = 0; i < numbersCDR3.size() - 1; i = i + 2){
Integer tmpCDR3a = numbersCDR3.get(i); Integer tmpCDR3a = numbersCDR3.get(i);
@@ -101,63 +44,62 @@ public class Simulator {
return new CellSample(distinctCells, cdr1Freq); return new CellSample(distinctCells, cdr1Freq);
} }
public static MatchingResult matchCDR3s(List<Integer[]> distinctCells, //Make the graph needed for matching CDR3s
Plate samplePlate, Integer lowThreshold, public static GraphWithMapData makeGraph(List<Integer[]> distinctCells, Plate samplePlate, boolean verbose) {
Integer highThreshold, boolean verbose){
if(verbose){System.out.println("Cells: " + distinctCells.size());}
Instant start = Instant.now(); Instant start = Instant.now();
int numWells = samplePlate.getSize();
int[] alphaIndex = {cdr3AlphaIndex}; int[] alphaIndex = {cdr3AlphaIndex};
int[] betaIndex = {cdr3BetaIndex}; int[] betaIndex = {cdr3BetaIndex};
int numWells = samplePlate.getSize();
if(verbose){System.out.println("Making cell maps");} if(verbose){System.out.println("Making cell maps");}
//HashMap keyed to Alphas, values Betas //HashMap keyed to Alphas, values Betas
Map<Integer, Integer> distCellsMapAlphaKey = makePeptideToPeptideMap(distinctCells, 0, 1); Map<Integer, Integer> distCellsMapAlphaKey = makeSequenceToSequenceMap(distinctCells, 0, 1);
if(verbose){System.out.println("Cell maps made");} if(verbose){System.out.println("Cell maps made");}
if(verbose){System.out.println("Making well maps");} if(verbose){System.out.println("Making well maps");}
Map<Integer, Integer> allAlphas = samplePlate.assayWellsPeptideP(alphaIndex); Map<Integer, Integer> allAlphas = samplePlate.assayWellsSequenceS(alphaIndex);
Map<Integer, Integer> allBetas = samplePlate.assayWellsPeptideP(betaIndex); Map<Integer, Integer> allBetas = samplePlate.assayWellsSequenceS(betaIndex);
int alphaCount = allAlphas.size(); int alphaCount = allAlphas.size();
if(verbose){System.out.println("all alphas count: " + alphaCount);} if(verbose){System.out.println("All alphas count: " + alphaCount);}
int betaCount = allBetas.size(); int betaCount = allBetas.size();
if(verbose){System.out.println("all betas count: " + betaCount);} if(verbose){System.out.println("All betas count: " + betaCount);}
if(verbose){System.out.println("Well maps made");} if(verbose){System.out.println("Well maps made");}
//Remove saturating-occupancy peptides because they have no signal value. //Remove saturating-occupancy sequences because they have no signal value.
//Remove below-minimum-overlap-threshold peptides because they can't possibly have an overlap with another //Remove sequences with total occupancy below minimum pair overlap threshold
//peptide that's above the threshold. if(verbose){System.out.println("Removing sequences present in all wells.");}
if(verbose){System.out.println("Removing peptides present in all wells.");} //if(verbose){System.out.println("Removing sequences with occupancy below the minimum overlap threshold");}
if(verbose){System.out.println("Removing peptides with occupancy below the minimum overlap threshold");} filterByOccupancyThreshold(allAlphas, 1, numWells - 1);
filterByOccupancyThreshold(allAlphas, lowThreshold, numWells - 1); filterByOccupancyThreshold(allBetas, 1, numWells - 1);
filterByOccupancyThreshold(allBetas, lowThreshold, numWells - 1); if(verbose){System.out.println("Sequences removed");}
if(verbose){System.out.println("Peptides removed");}
int pairableAlphaCount = allAlphas.size(); int pairableAlphaCount = allAlphas.size();
if(verbose){System.out.println("Remaining alpha count: " + pairableAlphaCount);} if(verbose){System.out.println("Remaining alphas count: " + pairableAlphaCount);}
int pairableBetaCount = allBetas.size(); int pairableBetaCount = allBetas.size();
if(verbose){System.out.println("Remaining beta count: " + pairableBetaCount);} if(verbose){System.out.println("Remaining betas count: " + pairableBetaCount);}
if(verbose){System.out.println("Making vertex maps");} if(verbose){System.out.println("Making vertex maps");}
//For the SimpleWeightedBipartiteGraphMatrixGenerator, all vertices must have //For the SimpleWeightedBipartiteGraphMatrixGenerator, all vertices must have
// distinct numbers associated with them. Since I'm using a 2D array, that means //distinct numbers associated with them. Since I'm using a 2D array, that means
// distinct indices between the rows and columns. vertexStartValue lets me track where I switch //distinct indices between the rows and columns. vertexStartValue lets me track where I switch
// from numbering rows to columns, so I can assign unique numbers to every vertex, and then //from numbering rows to columns, so I can assign unique numbers to every vertex, and then
// subtract the vertexStartValue from betas to use their vertex labels as array indices //subtract the vertexStartValue from betas to use their vertex labels as array indices
Integer vertexStartValue = 0; Integer vertexStartValue = 0;
//keys are sequential integer vertices, values are alphas //keys are sequential integer vertices, values are alphas
Map<Integer, Integer> plateVtoAMap = makeVertexToPeptideMap(allAlphas, vertexStartValue); Map<Integer, Integer> plateVtoAMap = makeVertexToSequenceMap(allAlphas, vertexStartValue);
//New start value for vertex to beta map should be one more than final vertex value in alpha map //new start value for vertex to beta map should be one more than final vertex value in alpha map
vertexStartValue += plateVtoAMap.size(); vertexStartValue += plateVtoAMap.size();
//keys are sequential integers vertices, values are betas //keys are sequential integers vertices, values are betas
Map<Integer, Integer> plateVtoBMap = makeVertexToPeptideMap(allBetas, vertexStartValue); Map<Integer, Integer> plateVtoBMap = makeVertexToSequenceMap(allBetas, vertexStartValue);
//keys are alphas, values are sequential integer vertices from previous map //keys are alphas, values are sequential integer vertices from previous map
Map<Integer, Integer> plateAtoVMap = invertVertexMap(plateVtoAMap); Map<Integer, Integer> plateAtoVMap = invertVertexMap(plateVtoAMap);
//keys are betas, values are sequential integer vertices from previous map //keys are betas, values are sequential integer vertices from previous map
Map<Integer, Integer> plateBtoVMap = invertVertexMap(plateVtoBMap); Map<Integer, Integer> plateBtoVMap = invertVertexMap(plateVtoBMap);
if(verbose){System.out.println("Vertex maps made");} if(verbose){System.out.println("Vertex maps made");}
//make adjacency matrix for bipartite graph generator
//(technically this is only 1/4 of an adjacency matrix, but that's all you need
//for a bipartite graph, and all the SimpleWeightedBipartiteGraphMatrixGenerator class expects.)
if(verbose){System.out.println("Creating adjacency matrix");} if(verbose){System.out.println("Creating adjacency matrix");}
//Count how many wells each alpha appears in //Count how many wells each alpha appears in
Map<Integer, Integer> alphaWellCounts = new HashMap<>(); Map<Integer, Integer> alphaWellCounts = new HashMap<>();
@@ -165,54 +107,78 @@ public class Simulator {
Map<Integer, Integer> betaWellCounts = new HashMap<>(); Map<Integer, Integer> betaWellCounts = new HashMap<>();
//the adjacency matrix to be used by the graph generator //the adjacency matrix to be used by the graph generator
double[][] weights = new double[plateVtoAMap.size()][plateVtoBMap.size()]; double[][] weights = new double[plateVtoAMap.size()][plateVtoBMap.size()];
countPeptidesAndFillMatrix(samplePlate, allAlphas, allBetas, plateAtoVMap, countSequencesAndFillMatrix(samplePlate, allAlphas, allBetas, plateAtoVMap,
plateBtoVMap, alphaIndex, betaIndex, alphaWellCounts, betaWellCounts, weights); plateBtoVMap, alphaIndex, betaIndex, alphaWellCounts, betaWellCounts, weights);
if(verbose){System.out.println("matrix created");} if(verbose){System.out.println("Matrix created");}
/** //create bipartite graph
* This is where the bipartite graph is created if(verbose){System.out.println("Creating graph");}
*/
if(verbose){System.out.println("creating graph");}
//the graph object //the graph object
SimpleWeightedGraph<Integer, DefaultWeightedEdge> graph = SimpleWeightedGraph<Integer, DefaultWeightedEdge> graph =
new SimpleWeightedGraph<>(DefaultWeightedEdge.class); new SimpleWeightedGraph<>(DefaultWeightedEdge.class);
//the graph generator //the graph generator
SimpleWeightedBipartiteGraphMatrixGenerator graphGenerator = new SimpleWeightedBipartiteGraphMatrixGenerator(); SimpleWeightedBipartiteGraphMatrixGenerator graphGenerator = new SimpleWeightedBipartiteGraphMatrixGenerator();
//the list of alpha vertices //the list of alpha vertices
List<Integer> alphaVertices = new ArrayList<>(); List<Integer> alphaVertices = new ArrayList<>(plateVtoAMap.keySet()); //This will work because LinkedHashMap preserves order of entry
alphaVertices.addAll(plateVtoAMap.keySet()); //This will work because LinkedHashMap preserves order of entry
graphGenerator.first(alphaVertices); graphGenerator.first(alphaVertices);
//the list of beta vertices //the list of beta vertices
List<Integer> betaVertices = new ArrayList<>(); List<Integer> betaVertices = new ArrayList<>(plateVtoBMap.keySet());
betaVertices.addAll(plateVtoBMap.keySet());
graphGenerator.second(betaVertices); //This will work because LinkedHashMap preserves order of entry graphGenerator.second(betaVertices); //This will work because LinkedHashMap preserves order of entry
//use adjacency matrix of weight created previously
graphGenerator.weights(weights); graphGenerator.weights(weights);
graphGenerator.generateGraph(graph); graphGenerator.generateGraph(graph);
if(verbose){System.out.println("Graph created");} if(verbose){System.out.println("Graph created");}
if(verbose){System.out.println("Eliminating edges with weights outside threshold values");} Instant stop = Instant.now();
Duration time = Duration.between(start, stop);
//return GraphWithMapData object
return new GraphWithMapData(graph, numWells, samplePlate.getConcentrations(), alphaCount, betaCount,
distCellsMapAlphaKey, plateVtoAMap, plateVtoBMap, plateAtoVMap,
plateBtoVMap, alphaWellCounts, betaWellCounts, time);
}
//match CDR3s.
public static MatchingResult matchCDR3s(GraphWithMapData data, String dataFilename, Integer lowThreshold,
Integer highThreshold, Integer maxOccupancyDifference,
Integer minOverlapPercent, boolean verbose) {
Instant start = Instant.now();
int numWells = data.getNumWells();
Integer alphaCount = data.getAlphaCount();
Integer betaCount = data.getBetaCount();
Map<Integer, Integer> distCellsMapAlphaKey = data.getDistCellsMapAlphaKey();
Map<Integer, Integer> plateVtoAMap = data.getPlateVtoAMap();
Map<Integer, Integer> plateVtoBMap = data.getPlateVtoBMap();
Map<Integer, Integer> alphaWellCounts = data.getAlphaWellCounts();
Map<Integer, Integer> betaWellCounts = data.getBetaWellCounts();
SimpleWeightedGraph<Integer, DefaultWeightedEdge> graph = data.getGraph();
//remove weights outside given overlap thresholds
if(verbose){System.out.println("Eliminating edges with weights outside overlap threshold values");}
filterByOccupancyThreshold(graph, lowThreshold, highThreshold); filterByOccupancyThreshold(graph, lowThreshold, highThreshold);
if(verbose){System.out.println("Over- and under-weight edges set to 0.0");} if(verbose){System.out.println("Over- and under-weight edges set to 0.0");}
//Filter by overlap size
if(verbose){System.out.println("Eliminating edges with weights less than " + minOverlapPercent.toString() +
" percent of vertex occupancy value.");}
filterByOverlapSize(graph, alphaWellCounts, betaWellCounts, plateVtoAMap, plateVtoBMap, minOverlapPercent);
if(verbose){System.out.println("Edges with weights too far below vertex occupancy values set to 0.0");}
/** //Filter by relative occupancy
* This is where the maximum weighted matching is found if(verbose){System.out.println("Eliminating edges between vertices with occupancy difference > "
*/ + maxOccupancyDifference);}
// if(verbose){System.out.println("Finding maximum weighted matching");} filterByRelativeOccupancy(graph, alphaWellCounts, betaWellCounts, plateVtoAMap, plateVtoBMap,
// MaximumWeightBipartiteMatching maxWeightMatching = maxOccupancyDifference);
// new MaximumWeightBipartiteMatching(graph, plateVtoAMap.keySet(), plateVtoBMap.keySet()); if(verbose){System.out.println("Edges between vertices of with excessively different occupancy values " +
// MatchingAlgorithm.Matching<String, DefaultWeightedEdge> graphMatching = maxWeightMatching.getMatching(); "set to 0.0");}
// if(verbose){System.out.println("Matching completed");} //Find Maximum Weighted Matching
// Instant stop = Instant.now(); //using jheaps library class PairingHeap for improved efficiency
//trying with jheaps now
if(verbose){System.out.println("Finding maximum weighted matching");} if(verbose){System.out.println("Finding maximum weighted matching");}
//Attempting to use addressable heap to improve performance //Attempting to use addressable heap to improve performance
MaximumWeightBipartiteMatching maxWeightMatching = MaximumWeightBipartiteMatching maxWeightMatching =
new MaximumWeightBipartiteMatching(graph, new MaximumWeightBipartiteMatching(graph,
plateVtoAMap.keySet(), plateVtoAMap.keySet(),
plateVtoBMap.keySet(), plateVtoBMap.keySet(),
(i) -> new PairingHeap(Comparator.naturalOrder())); i -> new PairingHeap(Comparator.naturalOrder()));
MatchingAlgorithm.Matching<String, DefaultWeightedEdge> graphMatching = maxWeightMatching.getMatching(); MatchingAlgorithm.Matching<String, DefaultWeightedEdge> graphMatching = maxWeightMatching.getMatching();
if(verbose){System.out.println("Matching completed");} if(verbose){System.out.println("Matching completed");}
Instant stop = Instant.now(); Instant stop = Instant.now();
@@ -227,16 +193,15 @@ public class Simulator {
header.add("Matched correctly?"); header.add("Matched correctly?");
header.add("P-value"); header.add("P-value");
//Results for csv file //Results for csv file
List<List<String>> allResults = new ArrayList<>(); List<List<String>> allResults = new ArrayList<>();
NumberFormat nf = NumberFormat.getInstance(Locale.US); NumberFormat nf = NumberFormat.getInstance(Locale.US);
MathContext mc = new MathContext(3); MathContext mc = new MathContext(3);
Iterator<DefaultWeightedEdge> weightIter = graphMatching.iterator(); Iterator<DefaultWeightedEdge> weightIter = graphMatching.iterator();
DefaultWeightedEdge e = null; DefaultWeightedEdge e;
int trueCount = 0; int trueCount = 0;
int falseCount = 0; int falseCount = 0;
boolean check = false; boolean check;
Map<Integer, Integer> matchMap = new HashMap<>(); Map<Integer, Integer> matchMap = new HashMap<>();
while(weightIter.hasNext()) { while(weightIter.hasNext()) {
e = weightIter.next(); e = weightIter.next();
@@ -269,31 +234,44 @@ public class Simulator {
} }
//Metadata comments for CSV file //Metadata comments for CSV file
int min = alphaCount > betaCount ? betaCount : alphaCount; int min = Math.min(alphaCount, betaCount);
double attemptRate = (double) (trueCount + falseCount) / min; double attemptRate = (double) (trueCount + falseCount) / min;
BigDecimal attemptRateTrunc = new BigDecimal(attemptRate, mc); BigDecimal attemptRateTrunc = new BigDecimal(attemptRate, mc);
double pairingErrorRate = (double) falseCount / (trueCount + falseCount); double pairingErrorRate = (double) falseCount / (trueCount + falseCount);
BigDecimal pairingErrorRateTrunc = new BigDecimal(pairingErrorRate, mc); BigDecimal pairingErrorRateTrunc = new BigDecimal(pairingErrorRate, mc);
//get list of well concentrations
List<Integer> wellConcentrations = Arrays.asList(data.getWellConcentrations());
//make string out of concentrations list
StringBuilder concentrationStringBuilder = new StringBuilder();
for(Integer i: wellConcentrations){
concentrationStringBuilder.append(i.toString());
concentrationStringBuilder.append(" ");
}
String concentrationString = concentrationStringBuilder.toString();
List<String> comments = new ArrayList<>(); List<String> comments = new ArrayList<>();
comments.add("Source Sample Plate filename: " + data.getSourceFilename());
comments.add("Source Graph and Data filename: " + dataFilename);
comments.add("T cell counts in sample plate wells: " + concentrationString);
comments.add("Total alphas found: " + alphaCount); comments.add("Total alphas found: " + alphaCount);
comments.add("Total betas found: " + betaCount); comments.add("Total betas found: " + betaCount);
comments.add("High overlap threshold: " + highThreshold); comments.add("High overlap threshold: " + highThreshold);
comments.add("Low overlap threshold: " + lowThreshold); comments.add("Low overlap threshold: " + lowThreshold);
comments.add("Minimum overlap percent: " + minOverlapPercent);
comments.add("Maximum occupancy difference: " + maxOccupancyDifference);
comments.add("Pairing attempt rate: " + attemptRateTrunc); comments.add("Pairing attempt rate: " + attemptRateTrunc);
comments.add("Correct pairings: " + trueCount); comments.add("Correct pairings: " + trueCount);
comments.add("Incorrect pairings: " + falseCount); comments.add("Incorrect pairings: " + falseCount);
comments.add("Pairing error rate: " + pairingErrorRateTrunc); comments.add("Pairing error rate: " + pairingErrorRateTrunc);
Duration time = Duration.between(start, stop); Duration time = Duration.between(start, stop);
time = time.plus(data.getTime());
comments.add("Simulation time: " + nf.format(time.toSeconds()) + " seconds"); comments.add("Simulation time: " + nf.format(time.toSeconds()) + " seconds");
if(verbose){ if(verbose){
for(String s: comments){ for(String s: comments){
System.out.println(s); System.out.println(s);
} }
} }
return new MatchingResult(data.getSourceFilename(), comments, header, allResults, matchMap, time);
return new MatchingResult(samplePlate.getSourceFileName(), comments, header, allResults, matchMap, time);
} }
//Simulated matching of CDR1s to CDR3s. Requires MatchingResult from prior run of matchCDR3s. //Simulated matching of CDR1s to CDR3s. Requires MatchingResult from prior run of matchCDR3s.
@@ -313,13 +291,13 @@ public class Simulator {
System.out.println("Previous match maps made"); System.out.println("Previous match maps made");
System.out.println("Making cell maps"); System.out.println("Making cell maps");
Map<Integer, Integer> alphaCDR3toCDR1Map = makePeptideToPeptideMap(distinctCells, cdr3AlphaIndex, cdr1AlphaIndex); Map<Integer, Integer> alphaCDR3toCDR1Map = makeSequenceToSequenceMap(distinctCells, cdr3AlphaIndex, cdr1AlphaIndex);
Map<Integer, Integer> betaCDR3toCDR1Map = makePeptideToPeptideMap(distinctCells, cdr3BetaIndex, cdr1BetaIndex); Map<Integer, Integer> betaCDR3toCDR1Map = makeSequenceToSequenceMap(distinctCells, cdr3BetaIndex, cdr1BetaIndex);
System.out.println("Cell maps made"); System.out.println("Cell maps made");
System.out.println("Making well maps"); System.out.println("Making well maps");
Map<Integer, Integer> allCDR3s = samplePlate.assayWellsPeptideP(cdr3Indices); Map<Integer, Integer> allCDR3s = samplePlate.assayWellsSequenceS(cdr3Indices);
Map<Integer, Integer> allCDR1s = samplePlate.assayWellsPeptideP(cdr1Indices); Map<Integer, Integer> allCDR1s = samplePlate.assayWellsSequenceS(cdr1Indices);
int CDR3Count = allCDR3s.size(); int CDR3Count = allCDR3s.size();
System.out.println("all CDR3s count: " + CDR3Count); System.out.println("all CDR3s count: " + CDR3Count);
int CDR1Count = allCDR1s.size(); int CDR1Count = allCDR1s.size();
@@ -353,11 +331,11 @@ public class Simulator {
// subtract the vertexStartValue from CDR1s to use their vertex labels as array indices // subtract the vertexStartValue from CDR1s to use their vertex labels as array indices
Integer vertexStartValue = 0; Integer vertexStartValue = 0;
//keys are sequential integer vertices, values are CDR3s //keys are sequential integer vertices, values are CDR3s
Map<Integer, Integer> plateVtoCDR3Map = makeVertexToPeptideMap(allCDR3s, vertexStartValue); Map<Integer, Integer> plateVtoCDR3Map = makeVertexToSequenceMap(allCDR3s, vertexStartValue);
//New start value for vertex to CDR1 map should be one more than final vertex value in CDR3 map //New start value for vertex to CDR1 map should be one more than final vertex value in CDR3 map
vertexStartValue += plateVtoCDR3Map.size(); vertexStartValue += plateVtoCDR3Map.size();
//keys are sequential integers vertices, values are CDR1s //keys are sequential integers vertices, values are CDR1s
Map<Integer, Integer> plateVtoCDR1Map = makeVertexToPeptideMap(allCDR1s, vertexStartValue); Map<Integer, Integer> plateVtoCDR1Map = makeVertexToSequenceMap(allCDR1s, vertexStartValue);
//keys are CDR3s, values are sequential integer vertices from previous map //keys are CDR3s, values are sequential integer vertices from previous map
Map<Integer, Integer> plateCDR3toVMap = invertVertexMap(plateVtoCDR3Map); Map<Integer, Integer> plateCDR3toVMap = invertVertexMap(plateVtoCDR3Map);
//keys are CDR1s, values are sequential integer vertices from previous map //keys are CDR1s, values are sequential integer vertices from previous map
@@ -374,7 +352,7 @@ public class Simulator {
Map<Integer, Integer> wellNCDR3s = null; Map<Integer, Integer> wellNCDR3s = null;
Map<Integer, Integer> wellNCDR1s = null; Map<Integer, Integer> wellNCDR1s = null;
double[][] weights = new double[plateVtoCDR3Map.size()][plateVtoCDR1Map.size()]; double[][] weights = new double[plateVtoCDR3Map.size()][plateVtoCDR1Map.size()];
countPeptidesAndFillMatrix(samplePlate, allCDR3s, allCDR1s, plateCDR3toVMap, plateCDR1toVMap, countSequencesAndFillMatrix(samplePlate, allCDR3s, allCDR1s, plateCDR3toVMap, plateCDR1toVMap,
cdr3Indices, cdr1Indices, cdr3WellCounts, cdr1WellCounts, weights); cdr3Indices, cdr1Indices, cdr3WellCounts, cdr1WellCounts, weights);
System.out.println("Matrix created"); System.out.println("Matrix created");
@@ -383,11 +361,9 @@ public class Simulator {
new SimpleWeightedGraph<>(DefaultWeightedEdge.class); new SimpleWeightedGraph<>(DefaultWeightedEdge.class);
SimpleWeightedBipartiteGraphMatrixGenerator graphGenerator = new SimpleWeightedBipartiteGraphMatrixGenerator(); SimpleWeightedBipartiteGraphMatrixGenerator graphGenerator = new SimpleWeightedBipartiteGraphMatrixGenerator();
List<Integer> cdr3Vertices = new ArrayList<>(); List<Integer> cdr3Vertices = new ArrayList<>(plateVtoCDR3Map.keySet()); //This will work because LinkedHashMap preserves order of entry
cdr3Vertices.addAll(plateVtoCDR3Map.keySet()); //This will work because LinkedHashMap preserves order of entry
graphGenerator.first(cdr3Vertices); graphGenerator.first(cdr3Vertices);
List<Integer> cdr1Vertices = new ArrayList<>(); List<Integer> cdr1Vertices = new ArrayList<>(plateVtoCDR1Map.keySet());
cdr1Vertices.addAll(plateVtoCDR1Map.keySet());
graphGenerator.second(cdr1Vertices); //This will work because LinkedHashMap preserves order of entry graphGenerator.second(cdr1Vertices); //This will work because LinkedHashMap preserves order of entry
graphGenerator.weights(weights); graphGenerator.weights(weights);
graphGenerator.generateGraph(graph); graphGenerator.generateGraph(graph);
@@ -407,7 +383,7 @@ public class Simulator {
//first processing run //first processing run
Map<Integer, Integer> firstMatchCDR3toCDR1Map = new HashMap<>(); Map<Integer, Integer> firstMatchCDR3toCDR1Map = new HashMap<>();
Iterator<DefaultWeightedEdge> weightIter = graphMatching.iterator(); Iterator<DefaultWeightedEdge> weightIter = graphMatching.iterator();
DefaultWeightedEdge e = null; DefaultWeightedEdge e;
while(weightIter.hasNext()){ while(weightIter.hasNext()){
e = weightIter.next(); e = weightIter.next();
// if(graph.getEdgeWeight(e) < lowThreshold || graph.getEdgeWeight(e) > highThreshold) { // if(graph.getEdgeWeight(e) < lowThreshold || graph.getEdgeWeight(e) > highThreshold) {
@@ -531,7 +507,7 @@ public class Simulator {
comments.add("Correct matching rate: " + correctRate); comments.add("Correct matching rate: " + correctRate);
NumberFormat nf = NumberFormat.getInstance(Locale.US); NumberFormat nf = NumberFormat.getInstance(Locale.US);
Duration time = Duration.between(start, stop); Duration time = Duration.between(start, stop);
time.plus(previousTime); time = time.plus(previousTime);
comments.add("Simulation time: " + nf.format(time.toSeconds()) + " seconds"); comments.add("Simulation time: " + nf.format(time.toSeconds()) + " seconds");
for(String s: comments){ for(String s: comments){
System.out.println(s); System.out.println(s);
@@ -605,38 +581,38 @@ public class Simulator {
//Counts the well occupancy of the row peptides and column peptides into given maps, and //Counts the well occupancy of the row peptides and column peptides into given maps, and
//fills weights in the given 2D array //fills weights in the given 2D array
private static void countPeptidesAndFillMatrix(Plate samplePlate, private static void countSequencesAndFillMatrix(Plate samplePlate,
Map<Integer,Integer> allRowPeptides, Map<Integer,Integer> allRowSequences,
Map<Integer,Integer> allColumnPeptides, Map<Integer,Integer> allColumnSequences,
Map<Integer,Integer> rowPeptideToVertexMap, Map<Integer,Integer> rowSequenceToVertexMap,
Map<Integer,Integer> columnPeptideToVertexMap, Map<Integer,Integer> columnSequenceToVertexMap,
int[] rowPeptideIndices, int[] rowSequenceIndices,
int[] colPeptideIndices, int[] colSequenceIndices,
Map<Integer, Integer> rowPeptideCounts, Map<Integer, Integer> rowSequenceCounts,
Map<Integer,Integer> columnPeptideCounts, Map<Integer,Integer> columnSequenceCounts,
double[][] weights){ double[][] weights){
Map<Integer, Integer> wellNRowPeptides = null; Map<Integer, Integer> wellNRowSequences = null;
Map<Integer, Integer> wellNColumnPeptides = null; Map<Integer, Integer> wellNColumnSequences = null;
int vertexStartValue = rowPeptideToVertexMap.size(); int vertexStartValue = rowSequenceToVertexMap.size();
int numWells = samplePlate.getSize(); int numWells = samplePlate.getSize();
for (int n = 0; n < numWells; n++) { for (int n = 0; n < numWells; n++) {
wellNRowPeptides = samplePlate.assayWellsPeptideP(n, rowPeptideIndices); wellNRowSequences = samplePlate.assayWellsSequenceS(n, rowSequenceIndices);
for (Integer a : wellNRowPeptides.keySet()) { for (Integer a : wellNRowSequences.keySet()) {
if(allRowPeptides.containsKey(a)){ if(allRowSequences.containsKey(a)){
rowPeptideCounts.merge(a, 1, (oldValue, newValue) -> oldValue + newValue); rowSequenceCounts.merge(a, 1, (oldValue, newValue) -> oldValue + newValue);
} }
} }
wellNColumnPeptides = samplePlate.assayWellsPeptideP(n, colPeptideIndices); wellNColumnSequences = samplePlate.assayWellsSequenceS(n, colSequenceIndices);
for (Integer b : wellNColumnPeptides.keySet()) { for (Integer b : wellNColumnSequences.keySet()) {
if(allColumnPeptides.containsKey(b)){ if(allColumnSequences.containsKey(b)){
columnPeptideCounts.merge(b, 1, (oldValue, newValue) -> oldValue + newValue); columnSequenceCounts.merge(b, 1, (oldValue, newValue) -> oldValue + newValue);
} }
} }
for (Integer i : wellNRowPeptides.keySet()) { for (Integer i : wellNRowSequences.keySet()) {
if(allRowPeptides.containsKey(i)){ if(allRowSequences.containsKey(i)){
for (Integer j : wellNColumnPeptides.keySet()) { for (Integer j : wellNColumnSequences.keySet()) {
if(allColumnPeptides.containsKey(j)){ if(allColumnSequences.containsKey(j)){
weights[rowPeptideToVertexMap.get(i)][columnPeptideToVertexMap.get(j) - vertexStartValue] += 1.0; weights[rowSequenceToVertexMap.get(i)][columnSequenceToVertexMap.get(j) - vertexStartValue] += 1.0;
} }
} }
} }
@@ -666,19 +642,54 @@ public class Simulator {
} }
} }
private static Map<Integer, Integer> makePeptideToPeptideMap(List<Integer[]> cells, int keyPeptideIndex, //Remove edges for pairs with large occupancy discrepancy
int valuePeptideIndex){ private static void filterByRelativeOccupancy(SimpleWeightedGraph<Integer, DefaultWeightedEdge> graph,
Map<Integer, Integer> keyPeptideToValuePeptideMap = new HashMap<>(); Map<Integer, Integer> alphaWellCounts,
for (Integer[] cell : cells) { Map<Integer, Integer> betaWellCounts,
keyPeptideToValuePeptideMap.put(cell[keyPeptideIndex], cell[valuePeptideIndex]); Map<Integer, Integer> plateVtoAMap,
Map<Integer, Integer> plateVtoBMap,
Integer maxOccupancyDifference) {
for (DefaultWeightedEdge e : graph.edgeSet()) {
Integer alphaOcc = alphaWellCounts.get(plateVtoAMap.get(graph.getEdgeSource(e)));
Integer betaOcc = betaWellCounts.get(plateVtoBMap.get(graph.getEdgeTarget(e)));
//Adjust this to something cleverer later
if (Math.abs(alphaOcc - betaOcc) >= maxOccupancyDifference) {
graph.setEdgeWeight(e, 0.0);
}
} }
return keyPeptideToValuePeptideMap;
} }
private static Map<Integer, Integer> makeVertexToPeptideMap(Map<Integer, Integer> peptides, Integer startValue) { //Remove edges for pairs where overlap size is significantly lower than the well occupancy
private static void filterByOverlapSize(SimpleWeightedGraph<Integer, DefaultWeightedEdge> graph,
Map<Integer, Integer> alphaWellCounts,
Map<Integer, Integer> betaWellCounts,
Map<Integer, Integer> plateVtoAMap,
Map<Integer, Integer> plateVtoBMap,
Integer minOverlapPercent) {
for (DefaultWeightedEdge e : graph.edgeSet()) {
Integer alphaOcc = alphaWellCounts.get(plateVtoAMap.get(graph.getEdgeSource(e)));
Integer betaOcc = betaWellCounts.get(plateVtoBMap.get(graph.getEdgeTarget(e)));
double weight = graph.getEdgeWeight(e);
double min = minOverlapPercent / 100.0;
if ((weight / alphaOcc < min) || (weight / betaOcc < min)) {
graph.setEdgeWeight(e, 0.0);
}
}
}
private static Map<Integer, Integer> makeSequenceToSequenceMap(List<Integer[]> cells, int keySequenceIndex,
int valueSequenceIndex){
Map<Integer, Integer> keySequenceToValueSequenceMap = new HashMap<>();
for (Integer[] cell : cells) {
keySequenceToValueSequenceMap.put(cell[keySequenceIndex], cell[valueSequenceIndex]);
}
return keySequenceToValueSequenceMap;
}
private static Map<Integer, Integer> makeVertexToSequenceMap(Map<Integer, Integer> sequences, Integer startValue) {
Map<Integer, Integer> map = new LinkedHashMap<>(); //LinkedHashMap to preserve order of entry Map<Integer, Integer> map = new LinkedHashMap<>(); //LinkedHashMap to preserve order of entry
Integer index = startValue; Integer index = startValue;
for (Integer k: peptides.keySet()) { for (Integer k: sequences.keySet()) {
map.put(index, k); map.put(index, k);
index++; index++;
} }
@@ -693,9 +704,4 @@ public class Simulator {
return inverse; return inverse;
} }
private static int bigDecimalComparator(BigDecimal bd, Integer i) {
return bd.compareTo(BigDecimal.valueOf(i));
}
} }

File diff suppressed because it is too large Load Diff

17
src/main/java/Vertex.java Normal file
View File

@@ -0,0 +1,17 @@
//Immutable pairing of a sequence identifier with its well-occupancy count,
//used as a graph vertex payload. NOTE(review): the rest of the codebase was
//renamed from "peptide" to "sequence"; getSequence() is the accessor
//consistent with that rename, while getPeptide() is retained (deprecated)
//for backward compatibility with existing callers.
public class Vertex {
    //Identifier of the sequence this vertex represents (historically "peptide")
    private final Integer peptide;
    //Number of wells this sequence was observed in
    private final Integer occupancy;

    public Vertex(Integer peptide, Integer occupancy) {
        this.peptide = peptide;
        this.occupancy = occupancy;
    }

    //Preferred accessor, consistent with the peptide->sequence rename elsewhere.
    public Integer getSequence() {
        return peptide;
    }

    //Retained for backward compatibility with the original naming.
    @Deprecated
    public Integer getPeptide() {
        return peptide;
    }

    public Integer getOccupancy() {
        return occupancy;
    }
}