From a43ee469ea79a540095f97881db8d17b361a264b Mon Sep 17 00:00:00 2001 From: eugenefischer <66030419+eugenefischer@users.noreply.github.com> Date: Wed, 9 Apr 2025 14:32:02 -0500 Subject: [PATCH] implement Zipf distribution --- .idea/.name | 1 + .idea/libraries/commons_rng_1.xml | 13 +++++ src/main/java/BiGpairSEQ.java | 5 ++ src/main/java/CommandLineInterface.java | 19 ++++++- src/main/java/DistributionType.java | 6 ++ src/main/java/InteractiveInterface.java | 63 ++++++++++++--------- src/main/java/Plate.java | 74 ++++++++++++++++++++----- src/main/java/PlateFileWriter.java | 42 ++++++++++---- 8 files changed, 169 insertions(+), 54 deletions(-) create mode 100644 .idea/.name create mode 100644 .idea/libraries/commons_rng_1.xml create mode 100644 src/main/java/DistributionType.java diff --git a/.idea/.name b/.idea/.name new file mode 100644 index 0000000..54e592b --- /dev/null +++ b/.idea/.name @@ -0,0 +1 @@ +BiGpairSEQ \ No newline at end of file diff --git a/.idea/libraries/commons_rng_1.xml b/.idea/libraries/commons_rng_1.xml new file mode 100644 index 0000000..8523868 --- /dev/null +++ b/.idea/libraries/commons_rng_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/main/java/BiGpairSEQ.java b/src/main/java/BiGpairSEQ.java index 17aad88..674e9d2 100644 --- a/src/main/java/BiGpairSEQ.java +++ b/src/main/java/BiGpairSEQ.java @@ -15,6 +15,7 @@ public class BiGpairSEQ { private static boolean cacheGraph = false; private static AlgorithmType matchingAlgoritmType = AlgorithmType.HUNGARIAN; private static HeapType priorityQueueHeapType = HeapType.PAIRING; + private static DistributionType distributionType = DistributionType.ZIPF; private static boolean outputBinary = true; private static boolean outputGraphML = false; private static boolean calculatePValue = false; @@ -60,6 +61,10 @@ public class BiGpairSEQ { return cellFilename; } + public static DistributionType getDistributionType() {return distributionType;} + + public static void setDistributionType(DistributionType type) {distributionType = type;} + public static Plate getPlateInMemory() { return plateInMemory; } diff --git a/src/main/java/CommandLineInterface.java b/src/main/java/CommandLineInterface.java index b6ee0fe..18a3fbe 100644 --- a/src/main/java/CommandLineInterface.java +++ b/src/main/java/CommandLineInterface.java @@ -123,16 +123,20 @@ public class CommandLineInterface { Plate plate; if (line.hasOption("poisson")) { Double stdDev = Math.sqrt(numWells); - plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, stdDev, false); + plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, stdDev); } else if (line.hasOption("gaussian")) { Double stdDev = Double.parseDouble(line.getOptionValue("stddev")); - plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, stdDev, false); + plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, stdDev); + } + else if (line.hasOption("zipf")) { + Double zipfExponent = Double.parseDouble(line.getOptionValue("exp")); + plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, zipfExponent); } else { assert line.hasOption("exponential"); Double lambda = Double.parseDouble(line.getOptionValue("lambda")); - plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, lambda, true); + plate = new Plate(cells, cellFilename, numWells, populations, dropoutRate, lambda); } PlateFileWriter writer = new PlateFileWriter(outputFilename, plate); writer.writePlateFile(); @@ -340,9 +344,13 @@ public class CommandLineInterface { Option exponential = Option.builder("exponential") .desc("Use an exponential distribution for cell sample") .build(); + Option zipf = Option.builder("zipf") + .desc("Use a Zipf distribution for cell sample") + .build(); distributions.addOption(poisson); distributions.addOption(gaussian); distributions.addOption(exponential); + distributions.addOption(zipf); //options group for statistical distribution parameters OptionGroup statParams = new OptionGroup();// add this to plate options Option stdDev = Option.builder("stddev") @@ -355,6 +363,11 @@ public class CommandLineInterface { .hasArg() .argName("value") .build(); + Option zipfExponent = Option.builder("exp") + .desc("If using -zipf flag, exponent value for distribution") + .hasArg() + .argName("value") + .build(); statParams.addOption(stdDev); statParams.addOption(lambda); //Option group for random plate or set populations diff --git a/src/main/java/DistributionType.java b/src/main/java/DistributionType.java new file mode 100644 index 0000000..a0272ab --- /dev/null +++ b/src/main/java/DistributionType.java @@ -0,0 +1,6 @@ +public enum DistributionType { + POISSON, + GAUSSIAN, + EXPONENTIAL, + ZIPF +} diff --git a/src/main/java/InteractiveInterface.java b/src/main/java/InteractiveInterface.java index 57f1358..027d0b7 100644 --- a/src/main/java/InteractiveInterface.java +++ b/src/main/java/InteractiveInterface.java @@ -89,14 +89,12 @@ public class InteractiveInterface { private static void makePlate() { String cellFile = null; String filename = null; - Double stdDev = 0.0; + Double parameter = 0.0; Integer numWells = 0; Integer numSections; Integer[] populations = {1}; Double dropOutRate = 0.0; - boolean poisson = false; - boolean exponential = false; - double lambda = 1.5; +; try { System.out.println("\nSimulated sample plates consist of:"); System.out.println("* a number of wells"); @@ -114,33 +112,46 @@ public class InteractiveInterface { System.out.println("1) Poisson"); System.out.println("2) Gaussian"); System.out.println("3) Exponential"); -// System.out.println("(Note: approximate distribution in original paper is exponential, lambda = 0.6)"); -// System.out.println("(lambda value approximated from slope of log-log graph in figure 4c)"); + System.out.println("4) Zipf"); + System.out.println("(Note: wider distributions are more memory intensive to match)"); System.out.print("Enter selection value: "); input = sc.nextInt(); switch (input) { - case 1 -> poisson = true; + case 1 -> { + BiGpairSEQ.setDistributionType(DistributionType.POISSON); + } case 2 -> { + BiGpairSEQ.setDistributionType(DistributionType.GAUSSIAN); System.out.println("How many distinct T-cells within one standard deviation of peak frequency?"); System.out.println("(Note: wider distributions are more memory intensive to match)"); - stdDev = sc.nextDouble(); - if (stdDev <= 0.0) { + parameter = sc.nextDouble(); + if (parameter <= 0.0) { throw new InputMismatchException("Value must be positive."); } } case 3 -> { - exponential = true; + BiGpairSEQ.setDistributionType(DistributionType.EXPONENTIAL); System.out.print("Please enter lambda value for exponential distribution: "); - lambda = sc.nextDouble(); - if (lambda <= 0.0) { - lambda = 0.6; - System.out.println("Value must be positive. Defaulting to 0.6."); + parameter = sc.nextDouble(); + if (parameter <= 0.0) { + parameter = 1.4; + System.out.println("Value must be positive. Defaulting to 1.4."); + } + } + case 4 -> { + BiGpairSEQ.setDistributionType(DistributionType.ZIPF); + System.out.print("Please enter exponent value for Zipf distribution: "); + parameter = sc.nextDouble(); + if (parameter <= 0.0) { + parameter = 1.4; + System.out.println("Value must be positive. Defaulting to 1.4."); } } default -> { System.out.println("Invalid input. Defaulting to exponential."); - exponential = true; + parameter = 1.4; + BiGpairSEQ.setDistributionType(DistributionType.EXPONENTIAL); } } System.out.print("\nNumber of wells on plate: "); @@ -226,16 +237,17 @@ public class InteractiveInterface { assert filename != null; Plate samplePlate; PlateFileWriter writer; - if(exponential){ - samplePlate = new Plate(cells, cellFile, numWells, populations, dropOutRate, lambda, true); - writer = new PlateFileWriter(filename, samplePlate); - } - else { - if (poisson) { - stdDev = Math.sqrt(cells.getCellCount()); //gaussian with square root of elements approximates poisson + DistributionType type = BiGpairSEQ.getDistributionType(); + switch(type) { + case POISSON -> { + parameter = Math.sqrt(cells.getCellCount()); //gaussian with square root of elements approximates poisson + samplePlate = new Plate(cells, cellFile, numWells, populations, dropOutRate, parameter); + writer = new PlateFileWriter(filename, samplePlate); + } + default -> { + samplePlate = new Plate(cells, cellFile, numWells, populations, dropOutRate, parameter); + writer = new PlateFileWriter(filename, samplePlate); } - samplePlate = new Plate(cells, cellFile, numWells, populations, dropOutRate, stdDev, false); - writer = new PlateFileWriter(filename, samplePlate); } System.out.println("Writing Sample Plate to file"); writer.writePlateFile(); @@ -605,12 +617,13 @@ public class InteractiveInterface { case 3 -> { BiGpairSEQ.setAuctionAlgorithm(); System.out.println("MWM algorithm set to auction"); + backToOptions = true; } case 4 -> { System.out.println("Scaling integer weight MWM algorithm not yet fully implemented. Sorry."); // BiGpairSEQ.setIntegerWeightScalingAlgorithm(); // System.out.println("MWM algorithm set to integer weight scaling algorithm of Duan and Su"); - backToOptions = true; +// backToOptions = true; } case 0 -> backToOptions = true; default -> System.out.println("Invalid input"); diff --git a/src/main/java/Plate.java b/src/main/java/Plate.java index d88d7ee..da8ba6b 100644 --- a/src/main/java/Plate.java +++ b/src/main/java/Plate.java @@ -13,6 +13,11 @@ TODO: Implement discrete frequency distributions using Vose's Alias Method */ +import org.apache.commons.rng.UniformRandomProvider; +import org.apache.commons.rng.core.BaseProvider; +import org.apache.commons.rng.sampling.distribution.RejectionInversionZipfSampler; +import org.apache.commons.rng.simple.JDKRandomWrapper; + import java.util.*; public class Plate { @@ -26,25 +31,22 @@ public class Plate { private Integer[] populations; private double stdDev; private double lambda; - boolean exponential = false; + private double zipfExponent; + private DistributionType distributionType; public Plate(CellSample cells, String cellFilename, int numWells, Integer[] populations, - double dropoutRate, double stdDev_or_lambda, boolean exponential){ + double dropoutRate, double parameter){ this.cells = cells; this.sourceFile = cellFilename; this.size = numWells; this.wells = new ArrayList<>(); this.error = dropoutRate; this.populations = populations; - this.exponential = exponential; - if (this.exponential) { - this.lambda = stdDev_or_lambda; - fillWellsExponential(cells.getCells(), this.lambda); - } - else { - this.stdDev = stdDev_or_lambda; - fillWells(cells.getCells(), this.stdDev); - } + this.stdDev = parameter; + this.lambda = parameter; + this.zipfExponent = parameter; + this.distributionType = BiGpairSEQ.getDistributionType(); + fillWells(cells.getCells()); } @@ -85,9 +87,33 @@ public class Plate { } } + private void fillWellsZipf(List cells, double exponent) { + int numSections = populations.length; + int section = 0; + int n; + RejectionInversionZipfSampler zipfSampler = new RejectionInversionZipfSampler(new JDKRandomWrapper(rand), cells.size(), exponent); + while (section < numSections){ + for (int i = 0; i < (size / numSections); i++) { + List well = new ArrayList<>(); + for (int j = 0; j < populations[section]; j++) { + do { + n = zipfSampler.sample(); + } while (n >= cells.size() || n < 0); + String[] cellToAdd = cells.get(n).clone(); + for(int k = 0; k < cellToAdd.length; k++){ + if(Math.abs(rand.nextDouble()) < error){//error applied to each sequence + cellToAdd[k] = "-1"; + } + } + well.add(cellToAdd); + } + wells.add(well); + } + section++; + } + } + private void fillWellsExponential(List cells, double lambda){ - this.lambda = lambda; - exponential = true; int numSections = populations.length; int section = 0; double m; @@ -143,6 +169,24 @@ public class Plate { } } + private void fillWells(List cells){ + DistributionType type = BiGpairSEQ.getDistributionType(); + switch (type) { + case POISSON, GAUSSIAN -> { + fillWells(cells, getStdDev()); + break; + } + case EXPONENTIAL -> { + fillWellsExponential(cells, getLambda()); + break; + } + case ZIPF -> { + fillWellsZipf(cells, getZipfExponent()); + break; + } + } + } + public Integer[] getPopulations(){ return populations; } @@ -155,10 +199,12 @@ public class Plate { return stdDev; } - public boolean isExponential(){return exponential;} + public DistributionType getDistributionType() { return distributionType;} public double getLambda(){return lambda;} + public double getZipfExponent(){return zipfExponent;} + public double getError() { return error; } diff --git a/src/main/java/PlateFileWriter.java b/src/main/java/PlateFileWriter.java index 191ce0e..258e9c7 100644 --- a/src/main/java/PlateFileWriter.java +++ b/src/main/java/PlateFileWriter.java @@ -13,11 +13,13 @@ public class PlateFileWriter { private List> wells; private double stdDev; private double lambda; + private double zipfExponent; + private DistributionType distributionType; private Double error; private String filename; private String sourceFileName; private Integer[] populations; - private boolean isExponential = false; + public PlateFileWriter(String filename, Plate plate) { if(!filename.matches(".*\\.csv")){ @@ -26,12 +28,17 @@ public class PlateFileWriter { this.filename = filename; this.sourceFileName = plate.getSourceFileName(); this.size = plate.getSize(); - this.isExponential = plate.isExponential(); - if(isExponential) { - this.lambda = plate.getLambda(); - } - else{ - this.stdDev = plate.getStdDev(); + this.distributionType = plate.getDistributionType(); + switch(distributionType) { + case POISSON, GAUSSIAN -> { + this.stdDev = plate.getStdDev(); + } + case EXPONENTIAL -> { + this.lambda = plate.getLambda(); + } + case ZIPF -> { + this.zipfExponent = plate.getZipfExponent(); + } } this.error = plate.getError(); this.wells = plate.getWells(); @@ -95,11 +102,22 @@ public class PlateFileWriter { printer.printComment("Plate size: " + size); printer.printComment("Well populations: " + wellPopulationsString); printer.printComment("Error rate: " + error); - if(isExponential){ - printer.printComment("Lambda: " + lambda); - } - else { - printer.printComment("Std. dev.: " + stdDev); + switch (distributionType) { + case POISSON -> { + printer.printComment("Cell frequency distribution: POISSON"); + } + case GAUSSIAN -> { + printer.printComment("Cell frequency distribution: GAUSSIAN"); + printer.printComment("--Standard deviation: " + stdDev); + } + case EXPONENTIAL -> { + printer.printComment("Cell frequency distribution: EXPONENTIAL"); + printer.printComment("--Lambda: " + lambda); + } + case ZIPF -> { + printer.printComment("Cell frequency distribution: ZIPF"); + printer.printComment("--Exponent: " + zipfExponent); + } } printer.printRecords(wellsAsStrings); } catch(IOException ex){