4 Commits

Author SHA1 Message Date
eugenefischer
b7597cff2a update readme and add Zipf exponent option to CLI 2025-04-09 16:16:46 -05:00
eugenefischer
7bbeaf7dad update readme 2025-04-09 14:40:49 -05:00
eugenefischer
945b967382 update readme 2025-04-09 14:39:46 -05:00
eugenefischer
a43ee469ea implement Zipf distribution 2025-04-09 14:32:02 -05:00
9 changed files with 177 additions and 56 deletions

1
.idea/.name generated Normal file
View File

@@ -0,0 +1 @@
BiGpairSEQ

13
.idea/libraries/commons_rng_1.xml generated Normal file
View File

@@ -0,0 +1,13 @@
<component name="libraryTable">
<library name="commons-rng-1">
<CLASSES>
<root url="file://$USER_HOME$/Downloads/commons-rng-1.6" />
</CLASSES>
<JAVADOC />
<SOURCES>
<root url="file://$USER_HOME$/Downloads/commons-rng-1.6" />
</SOURCES>
<jarDirectory url="file://$USER_HOME$/Downloads/commons-rng-1.6" recursive="false" />
<jarDirectory url="file://$USER_HOME$/Downloads/commons-rng-1.6" recursive="false" type="SOURCES" />
</library>
</component>

View File

@@ -136,7 +136,7 @@ There are a number of command line options, to allow the program to be used in s
`java -jar BiGpairSEQ_Sim.jar -help`
```
usage: BiGpairSEQ_Sim.jar
usage: BiGpairSEQ_Sim.jar
-cells,--make-cells Makes a cell sample file of distinct T cells
-graph,--make-graph Makes a graph/data file. Requires a cell sample
file and a sample plate file
@@ -156,6 +156,8 @@ usage: BiGpairSEQ_Sim.jar -plate
-c,--cell-file <filename> The cell sample file to use
-d,--dropout-rate <rate> The sequence dropout rate due to
amplification error. (0.0 - 1.0)
-exp <value> If using -zipf flag, exponent value for
distribution
-exponential Use an exponential distribution for cell
sample
-gaussian Use a Gaussian distribution for cell sample
@@ -173,6 +175,7 @@ usage: BiGpairSEQ_Sim.jar -plate
-stddev <value> If using -gaussian flag, standard deviation
for distrbution
-w,--wells <number> The number of wells on the sample plate
-zipf Use a Zipf distribution for cell sample
usage: BiGpairSEQ_Sim.jar -graph
-c,--cell-file <filename> Cell sample file to use for
@@ -234,7 +237,6 @@ usage: BiGpairSEQ_Sim.jar -match
to stdout.
-pv,--p-value (Optional) Calculate p-values for sequence
pairs.
```
### INTERACTIVE INTERFACE
@@ -340,6 +342,8 @@ Options when making a Sample Plate file:
* Standard deviation size
* Exponential
* Lambda value
* Zipf
* Exponent value
* Total number of wells on the plate
* Well populations random or fixed
* If random, minimum and maximum population sizes
@@ -630,6 +634,7 @@ a means of exploring some very beautiful math.
## TODO
* Update CLI option text in this readme to include Zipf distribution options
* ~~Try invoking GC at end of workloads to reduce paging to disk~~ DONE
* ~~Hold graph data in memory until another graph is read-in? ABANDONED UNABANDONED~~ DONE
* ~~*No, this won't work, because BiGpairSEQ simulations alter the underlying graph based on filtering constraints. Changes would cascade with multiple experiments.*~~

View File

@@ -15,6 +15,7 @@ public class BiGpairSEQ {
private static boolean cacheGraph = false;
private static AlgorithmType matchingAlgoritmType = AlgorithmType.HUNGARIAN;
private static HeapType priorityQueueHeapType = HeapType.PAIRING;
private static DistributionType distributionType = DistributionType.ZIPF;
private static boolean outputBinary = true;
private static boolean outputGraphML = false;
private static boolean calculatePValue = false;
@@ -60,6 +61,10 @@ public class BiGpairSEQ {
return cellFilename;
}
public static DistributionType getDistributionType() {return distributionType;}
public static void setDistributionType(DistributionType type) {distributionType = type;}
public static Plate getPlateInMemory() {
return plateInMemory;
}

View File

@@ -123,16 +123,20 @@ public class CommandLineInterface {
Plate plate;
if (line.hasOption("poisson")) {
Double stdDev = Math.sqrt(numWells);
plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, stdDev, false);
plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, stdDev);
}
else if (line.hasOption("gaussian")) {
Double stdDev = Double.parseDouble(line.getOptionValue("stddev"));
plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, stdDev, false);
plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, stdDev);
}
else if (line.hasOption("zipf")) {
Double zipfExponent = Double.parseDouble(line.getOptionValue("exp"));
plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, zipfExponent);
}
else {
assert line.hasOption("exponential");
Double lambda = Double.parseDouble(line.getOptionValue("lambda"));
plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, lambda, true);
plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, lambda);
}
PlateFileWriter writer = new PlateFileWriter(outputFilename, plate);
writer.writePlateFile();
@@ -340,9 +344,13 @@ public class CommandLineInterface {
Option exponential = Option.builder("exponential")
.desc("Use an exponential distribution for cell sample")
.build();
Option zipf = Option.builder("zipf")
.desc("Use a Zipf distribution for cell sample")
.build();
distributions.addOption(poisson);
distributions.addOption(gaussian);
distributions.addOption(exponential);
distributions.addOption(zipf);
//options group for statistical distribution parameters
OptionGroup statParams = new OptionGroup();// add this to plate options
Option stdDev = Option.builder("stddev")
@@ -355,6 +363,11 @@ public class CommandLineInterface {
.hasArg()
.argName("value")
.build();
Option zipfExponent = Option.builder("exp")
.desc("If using -zipf flag, exponent value for distribution")
.hasArg()
.argName("value")
.build();
statParams.addOption(stdDev);
statParams.addOption(lambda);
//Option group for random plate or set populations
@@ -386,6 +399,7 @@ public class CommandLineInterface {
plateOptions.addOptionGroup(statParams);
plateOptions.addOptionGroup(wellPopOptions);
plateOptions.addOption(dropoutRate);
plateOptions.addOption(zipfExponent);
plateOptions.addOption(outputFileOption());
return plateOptions;
}

View File

@@ -0,0 +1,6 @@
public enum DistributionType {
POISSON,
GAUSSIAN,
EXPONENTIAL,
ZIPF
}

View File

@@ -89,14 +89,12 @@ public class InteractiveInterface {
private static void makePlate() {
String cellFile = null;
String filename = null;
Double stdDev = 0.0;
Double parameter = 0.0;
Integer numWells = 0;
Integer numSections;
Integer[] populations = {1};
Double dropOutRate = 0.0;
boolean poisson = false;
boolean exponential = false;
double lambda = 1.5;
;
try {
System.out.println("\nSimulated sample plates consist of:");
System.out.println("* a number of wells");
@@ -114,33 +112,46 @@ public class InteractiveInterface {
System.out.println("1) Poisson");
System.out.println("2) Gaussian");
System.out.println("3) Exponential");
// System.out.println("(Note: approximate distribution in original paper is exponential, lambda = 0.6)");
// System.out.println("(lambda value approximated from slope of log-log graph in figure 4c)");
System.out.println("4) Zipf");
System.out.println("(Note: wider distributions are more memory intensive to match)");
System.out.print("Enter selection value: ");
input = sc.nextInt();
switch (input) {
case 1 -> poisson = true;
case 1 -> {
BiGpairSEQ.setDistributionType(DistributionType.POISSON);
}
case 2 -> {
BiGpairSEQ.setDistributionType(DistributionType.GAUSSIAN);
System.out.println("How many distinct T-cells within one standard deviation of peak frequency?");
System.out.println("(Note: wider distributions are more memory intensive to match)");
stdDev = sc.nextDouble();
if (stdDev <= 0.0) {
parameter = sc.nextDouble();
if (parameter <= 0.0) {
throw new InputMismatchException("Value must be positive.");
}
}
case 3 -> {
exponential = true;
BiGpairSEQ.setDistributionType(DistributionType.EXPONENTIAL);
System.out.print("Please enter lambda value for exponential distribution: ");
lambda = sc.nextDouble();
if (lambda <= 0.0) {
lambda = 0.6;
System.out.println("Value must be positive. Defaulting to 0.6.");
parameter = sc.nextDouble();
if (parameter <= 0.0) {
parameter = 1.4;
System.out.println("Value must be positive. Defaulting to 1.4.");
}
}
case 4 -> {
BiGpairSEQ.setDistributionType(DistributionType.ZIPF);
System.out.print("Please enter exponent value for Zipf distribution: ");
parameter = sc.nextDouble();
if (parameter <= 0.0) {
parameter = 1.4;
System.out.println("Value must be positive. Defaulting to 1.4.");
}
}
default -> {
System.out.println("Invalid input. Defaulting to exponential.");
exponential = true;
parameter = 1.4;
BiGpairSEQ.setDistributionType(DistributionType.EXPONENTIAL);
}
}
System.out.print("\nNumber of wells on plate: ");
@@ -226,17 +237,18 @@ public class InteractiveInterface {
assert filename != null;
Plate samplePlate;
PlateFileWriter writer;
if(exponential){
samplePlate = new Plate(cells, cellFile, numWells, populations, dropOutRate, lambda, true);
DistributionType type = BiGpairSEQ.getDistributionType();
switch(type) {
case POISSON -> {
parameter = Math.sqrt(cells.getCellCount()); //gaussian with square root of elements approximates poisson
samplePlate = new Plate(cells, cellFile, numWells, populations, dropOutRate, parameter);
writer = new PlateFileWriter(filename, samplePlate);
}
else {
if (poisson) {
stdDev = Math.sqrt(cells.getCellCount()); //gaussian with square root of elements approximates poisson
}
samplePlate = new Plate(cells, cellFile, numWells, populations, dropOutRate, stdDev, false);
default -> {
samplePlate = new Plate(cells, cellFile, numWells, populations, dropOutRate, parameter);
writer = new PlateFileWriter(filename, samplePlate);
}
}
System.out.println("Writing Sample Plate to file");
writer.writePlateFile();
System.out.println("Sample Plate written to file: " + filename);
@@ -605,12 +617,13 @@ public class InteractiveInterface {
case 3 -> {
BiGpairSEQ.setAuctionAlgorithm();
System.out.println("MWM algorithm set to auction");
backToOptions = true;
}
case 4 -> {
System.out.println("Scaling integer weight MWM algorithm not yet fully implemented. Sorry.");
// BiGpairSEQ.setIntegerWeightScalingAlgorithm();
// System.out.println("MWM algorithm set to integer weight scaling algorithm of Duan and Su");
backToOptions = true;
// backToOptions = true;
}
case 0 -> backToOptions = true;
default -> System.out.println("Invalid input");

View File

@@ -13,6 +13,11 @@ TODO: Implement discrete frequency distributions using Vose's Alias Method
*/
import org.apache.commons.rng.UniformRandomProvider;
import org.apache.commons.rng.core.BaseProvider;
import org.apache.commons.rng.sampling.distribution.RejectionInversionZipfSampler;
import org.apache.commons.rng.simple.JDKRandomWrapper;
import java.util.*;
public class Plate {
@@ -26,25 +31,22 @@ public class Plate {
private Integer[] populations;
private double stdDev;
private double lambda;
boolean exponential = false;
private double zipfExponent;
private DistributionType distributionType;
public Plate(CellSample cells, String cellFilename, int numWells, Integer[] populations,
double dropoutRate, double stdDev_or_lambda, boolean exponential){
double dropoutRate, double parameter){
this.cells = cells;
this.sourceFile = cellFilename;
this.size = numWells;
this.wells = new ArrayList<>();
this.error = dropoutRate;
this.populations = populations;
this.exponential = exponential;
if (this.exponential) {
this.lambda = stdDev_or_lambda;
fillWellsExponential(cells.getCells(), this.lambda);
}
else {
this.stdDev = stdDev_or_lambda;
fillWells(cells.getCells(), this.stdDev);
}
this.stdDev = parameter;
this.lambda = parameter;
this.zipfExponent = parameter;
this.distributionType = BiGpairSEQ.getDistributionType();
fillWells(cells.getCells());
}
@@ -85,9 +87,33 @@ public class Plate {
}
}
private void fillWellsZipf(List<String[]> cells, double exponent) {
int numSections = populations.length;
int section = 0;
int n;
RejectionInversionZipfSampler zipfSampler = new RejectionInversionZipfSampler(new JDKRandomWrapper(rand), cells.size(), exponent);
while (section < numSections){
for (int i = 0; i < (size / numSections); i++) {
List<String[]> well = new ArrayList<>();
for (int j = 0; j < populations[section]; j++) {
do {
n = zipfSampler.sample();
} while (n >= cells.size() || n < 0);
String[] cellToAdd = cells.get(n).clone();
for(int k = 0; k < cellToAdd.length; k++){
if(Math.abs(rand.nextDouble()) < error){//error applied to each sequence
cellToAdd[k] = "-1";
}
}
well.add(cellToAdd);
}
wells.add(well);
}
section++;
}
}
private void fillWellsExponential(List<String[]> cells, double lambda){
this.lambda = lambda;
exponential = true;
int numSections = populations.length;
int section = 0;
double m;
@@ -143,6 +169,24 @@ public class Plate {
}
}
private void fillWells(List<String[]> cells){
DistributionType type = BiGpairSEQ.getDistributionType();
switch (type) {
case POISSON, GAUSSIAN -> {
fillWells(cells, getStdDev());
break;
}
case EXPONENTIAL -> {
fillWellsExponential(cells, getLambda());
break;
}
case ZIPF -> {
fillWellsZipf(cells, getZipfExponent());
break;
}
}
}
public Integer[] getPopulations(){
return populations;
}
@@ -155,10 +199,12 @@ public class Plate {
return stdDev;
}
public boolean isExponential(){return exponential;}
public DistributionType getDistributionType() { return distributionType;}
public double getLambda(){return lambda;}
public double getZipfExponent(){return zipfExponent;}
public double getError() {
return error;
}

View File

@@ -13,11 +13,13 @@ public class PlateFileWriter {
private List<List<String[]>> wells;
private double stdDev;
private double lambda;
private double zipfExponent;
private DistributionType distributionType;
private Double error;
private String filename;
private String sourceFileName;
private Integer[] populations;
private boolean isExponential = false;
public PlateFileWriter(String filename, Plate plate) {
if(!filename.matches(".*\\.csv")){
@@ -26,12 +28,17 @@ public class PlateFileWriter {
this.filename = filename;
this.sourceFileName = plate.getSourceFileName();
this.size = plate.getSize();
this.isExponential = plate.isExponential();
if(isExponential) {
this.distributionType = plate.getDistributionType();
switch(distributionType) {
case POISSON, GAUSSIAN -> {
this.stdDev = plate.getStdDev();
}
case EXPONENTIAL -> {
this.lambda = plate.getLambda();
}
else{
this.stdDev = plate.getStdDev();
case ZIPF -> {
this.zipfExponent = plate.getZipfExponent();
}
}
this.error = plate.getError();
this.wells = plate.getWells();
@@ -95,11 +102,22 @@ public class PlateFileWriter {
printer.printComment("Plate size: " + size);
printer.printComment("Well populations: " + wellPopulationsString);
printer.printComment("Error rate: " + error);
if(isExponential){
printer.printComment("Lambda: " + lambda);
switch (distributionType) {
case POISSON -> {
printer.printComment("Cell frequency distribution: POISSON");
}
case GAUSSIAN -> {
printer.printComment("Cell frequency distribution: GAUSSIAN");
printer.printComment("--Standard deviation: " + stdDev);
}
case EXPONENTIAL -> {
printer.printComment("Cell frequency distribution: EXPONENTIAL");
printer.printComment("--Lambda: " + lambda);
}
case ZIPF -> {
printer.printComment("Cell frequency distribution: ZIPF");
printer.printComment("--Exponent: " + zipfExponent);
}
else {
printer.printComment("Std. dev.: " + stdDev);
}
printer.printRecords(wellsAsStrings);
} catch(IOException ex){