Files
BiGpairSEQ/src/main/java/Plate.java
2025-04-09 14:32:02 -05:00

321 lines
14 KiB
Java

/*
TODO: Implement exponential distribution using inversion method - DONE
TODO: Implement collisions with real sequences by having the counting function keep a map of all sequences it's read,
with values of all misreads. Can then have a spurious/real collision rate, which will have count randomly select a sequence
it's already read at least once, and put that into the list of spurious sequences for the given real sequence. Will let me get rid
of the distinctMisreadCount map, and use this new map instead. Doing it this way, once a sequence has been misread as another
sequence once, it is more likely to be misread that way again, as future read error collisions can also be real sequence collisions
Prob A: a read error occurs. Prob B: it's a new error (otherwise it's a repeated error). Prob C: if new error, prob that it's
a real sequence collision (otherwise it's a new spurious sequence) - DONE
TODO: Implement discrete frequency distributions using Vose's Alias Method
*/
import org.apache.commons.rng.UniformRandomProvider;
import org.apache.commons.rng.core.BaseProvider;
import org.apache.commons.rng.sampling.distribution.RejectionInversionZipfSampler;
import org.apache.commons.rng.simple.JDKRandomWrapper;
import java.util.*;
public class Plate {
private CellSample cells;
private String sourceFile;
private String filename;
private List<List<String[]>> wells;
private final Random rand = BiGpairSEQ.getRand();
private int size;
private double error;
private Integer[] populations;
private double stdDev;
private double lambda;
private double zipfExponent;
private DistributionType distributionType;
public Plate(CellSample cells, String cellFilename, int numWells, Integer[] populations,
double dropoutRate, double parameter){
this.cells = cells;
this.sourceFile = cellFilename;
this.size = numWells;
this.wells = new ArrayList<>();
this.error = dropoutRate;
this.populations = populations;
this.stdDev = parameter;
this.lambda = parameter;
this.zipfExponent = parameter;
this.distributionType = BiGpairSEQ.getDistributionType();
fillWells(cells.getCells());
}
public Plate(int size, double error, Integer[] populations) {
this.size = size;
this.error = error;
this.populations = populations;
wells = new ArrayList<>();
}
//constructor for returning a Plate from a PlateFileReader
public Plate(String filename, List<List<String[]>> wells) {
this.filename = filename;
this.wells = wells;
this.size = wells.size();
double totalCellCount = 0.0;
double totalDropoutCount = 0.0;
List<Integer> concentrations = new ArrayList<>();
for (List<String[]> w: wells) {
if(!concentrations.contains(w.size())){
concentrations.add(w.size());
}
for (String[] cell: w) {
totalCellCount += 1.0;
for (String sequence: cell) {
if("-1".equals(sequence)) {
totalDropoutCount += 1.0;
}
}
}
}
double totalSequenceCount = totalCellCount * 4;
this.error = totalDropoutCount / totalSequenceCount;
this.populations = new Integer[concentrations.size()];
for (int i = 0; i < this.populations.length; i++) {
this.populations[i] = concentrations.get(i);
}
}
private void fillWellsZipf(List<String[]> cells, double exponent) {
int numSections = populations.length;
int section = 0;
int n;
RejectionInversionZipfSampler zipfSampler = new RejectionInversionZipfSampler(new JDKRandomWrapper(rand), cells.size(), exponent);
while (section < numSections){
for (int i = 0; i < (size / numSections); i++) {
List<String[]> well = new ArrayList<>();
for (int j = 0; j < populations[section]; j++) {
do {
n = zipfSampler.sample();
} while (n >= cells.size() || n < 0);
String[] cellToAdd = cells.get(n).clone();
for(int k = 0; k < cellToAdd.length; k++){
if(Math.abs(rand.nextDouble()) < error){//error applied to each sequence
cellToAdd[k] = "-1";
}
}
well.add(cellToAdd);
}
wells.add(well);
}
section++;
}
}
private void fillWellsExponential(List<String[]> cells, double lambda){
int numSections = populations.length;
int section = 0;
double m;
int n;
while (section < numSections){
for (int i = 0; i < (size / numSections); i++) {
List<String[]> well = new ArrayList<>();
for (int j = 0; j < populations[section]; j++) {
do {
//inverse transform sampling: for random number u in [0,1), x = log(1-u) / (-lambda)
m = (Math.log10((1 - rand.nextDouble()))/(-lambda)) * Math.sqrt(cells.size());
} while (m >= cells.size() || m < 0);
n = (int) Math.floor(m);
String[] cellToAdd = cells.get(n).clone();
for(int k = 0; k < cellToAdd.length; k++){
if(Math.abs(rand.nextDouble()) <= error){//error applied to each sequence
cellToAdd[k] = "-1";
}
}
well.add(cellToAdd);
}
wells.add(well);
}
section++;
}
}
private void fillWells( List<String[]> cells, double stdDev) {
this.stdDev = stdDev;
int numSections = populations.length;
int section = 0;
double m;
int n;
while (section < numSections){
for (int i = 0; i < (size / numSections); i++) {
List<String[]> well = new ArrayList<>();
for (int j = 0; j < populations[section]; j++) {
do {
m = (rand.nextGaussian() * stdDev) + (cells.size() / 2);
} while (m >= cells.size() || m < 0);
n = (int) Math.floor(m);
String[] cellToAdd = cells.get(n).clone();
for(int k = 0; k < cellToAdd.length; k++){
if(Math.abs(rand.nextDouble()) < error){//error applied to each sequence
cellToAdd[k] = "-1";
}
}
well.add(cellToAdd);
}
wells.add(well);
}
section++;
}
}
private void fillWells(List<String[]> cells){
DistributionType type = BiGpairSEQ.getDistributionType();
switch (type) {
case POISSON, GAUSSIAN -> {
fillWells(cells, getStdDev());
break;
}
case EXPONENTIAL -> {
fillWellsExponential(cells, getLambda());
break;
}
case ZIPF -> {
fillWellsZipf(cells, getZipfExponent());
break;
}
}
}
public Integer[] getPopulations(){
return populations;
}
public int getSize(){
return size;
}
public double getStdDev() {
return stdDev;
}
public DistributionType getDistributionType() { return distributionType;}
public double getLambda(){return lambda;}
public double getZipfExponent(){return zipfExponent;}
public double getError() {
return error;
}
public List<List<String[]>> getWells() {
return wells;
}
//For the sequences at cell indices sIndices, counts number of unique sequences in all wells.
//Also simulates sequence read errors with given probabilities.
//Returns a map of SequenceRecords containing plate data for all sequences read.
//TODO actually implement usage of misreadSequences - DONE
public Map<String, SequenceRecord> countSequences(Integer readDepth, Double readErrorRate,
Double errorCollisionRate, Double realSequenceCollisionRate, int... sIndices) {
SequenceType[] sequenceTypes = EnumSet.allOf(SequenceType.class).toArray(new SequenceType[0]);
//Map of all real sequences read. Keys are sequences, values are ways sequence has been misread.
Map<String, List<String>> sequencesAndMisreads = new HashMap<>();
//Map of all sequences read. Keys are sequences, values are associated SequenceRecords
Map<String, SequenceRecord> sequenceMap = new LinkedHashMap<>();
//get list of all distinct, real sequences
String[] realSequences = assayWells(sIndices).toArray(new String[0]);
for (int well = 0; well < size; well++) {
for (String[] cell: wells.get(well)) {
for (int sIndex: sIndices) {
//the sequence being read
String currentSequence = cell[sIndex];
//skip dropout sequences, which have value -1
if (!"-1".equals(currentSequence)) {
//keep rereading the sequence until the read depth is reached
for (int j = 0; j < readDepth; j++) {
//The sequence is misread
if (rand.nextDouble() < readErrorRate) {
//The sequence hasn't been read or misread before
if (!sequencesAndMisreads.containsKey(currentSequence)) {
sequencesAndMisreads.put(currentSequence, new ArrayList<>());
}
//The specific misread hasn't happened before
if (rand.nextDouble() >= errorCollisionRate || sequencesAndMisreads.get(currentSequence).size() == 0) {
//The misread doesn't collide with a real sequence already on the plate and some sequences have already been read
if(rand.nextDouble() >= realSequenceCollisionRate || !sequenceMap.isEmpty()){
StringBuilder spurious = new StringBuilder(currentSequence);
for (int k = 0; k <= sequencesAndMisreads.get(currentSequence).size(); k++) {
spurious.append("*");
}
//New sequence record for the spurious sequence
SequenceRecord tmp = new SequenceRecord(spurious.toString(), sequenceTypes[sIndex]);
tmp.addRead(well);
sequenceMap.put(spurious.toString(), tmp);
//add spurious sequence to list of misreads for the real sequence
sequencesAndMisreads.get(currentSequence).add(spurious.toString());
}
//The misread collides with a real sequence already read from plate
else {
String wrongSequence;
do{
//get a random real sequence that's been read from the plate before
int index = rand.nextInt(realSequences.length);
wrongSequence = realSequences[index];
//make sure it's not accidentally the *right* sequence
//Also that it's not a wrong sequence already in the misread list
} while(currentSequence.equals(wrongSequence) || sequencesAndMisreads.get(currentSequence).contains(wrongSequence));
//update the SequenceRecord for wrongSequence
sequenceMap.get(wrongSequence).addRead(well);
//add wrongSequence to the misreads for currentSequence
sequencesAndMisreads.get(currentSequence).add(wrongSequence);
}
}
}
//The sequence is read correctly
else {
//the sequence hasn't been read before
if (!sequenceMap.containsKey(currentSequence)) {
//create new record for the sequence
SequenceRecord tmp = new SequenceRecord(currentSequence, sequenceTypes[sIndex]);
//add this read to the sequence record
tmp.addRead(well);
//add the sequence and its record to the sequence map
sequenceMap.put(currentSequence, tmp);
//add the sequence to the sequences and misreads map
sequencesAndMisreads.put(currentSequence, new ArrayList<>());
}
//the sequence has been read before
else {
//get the sequence's record and add this read to it
sequenceMap.get(currentSequence).addRead(well);
}
}
}
}
}
}
}
return sequenceMap;
}
private HashSet<String> assayWells(int[] indices) {
HashSet<String> allSequences = new HashSet<>();
for (List<String[]> well: wells) {
for (String[] cell: well) {
for(int index: indices) {
allSequences.add(cell[index]);
}
}
}
return allSequences;
}
public String getSourceFileName() {
return sourceFile;
}
public String getFilename() { return filename; }
}