321 lines
14 KiB
Java
321 lines
14 KiB
Java
|
|
|
|
/*
|
|
TODO: Implement exponential distribution using inversion method - DONE
|
|
TODO: Implement collisions with real sequences by having the counting function keep a map of all sequences it's read,
|
|
with values of all misreads. Can then have a spurious/real collision rate, which will have count randomly select a sequence
|
|
it's already read at least once, and put that into the list of spurious sequences for the given real sequence. Will let me get rid
|
|
of the distinctMisreadCount map, and use this new map instead. Doing it this way, once a sequence has been misread as another
|
|
sequence once, it is more likely to be misread that way again, as future read error collisions can also be real sequence collisions
|
|
Prob A: a read error occurs. Prob B: it's a new error (otherwise it's a repeated error). Prob C: if new error, prob that it's
|
|
a real sequence collision (otherwise it's a new spurious sequence) - DONE
|
|
TODO: Implement discrete frequency distributions using Vose's Alias Method
|
|
*/
|
|
|
|
|
|
import org.apache.commons.rng.UniformRandomProvider;
|
|
import org.apache.commons.rng.core.BaseProvider;
|
|
import org.apache.commons.rng.sampling.distribution.RejectionInversionZipfSampler;
|
|
import org.apache.commons.rng.simple.JDKRandomWrapper;
|
|
|
|
import java.util.*;
|
|
|
|
public class Plate {
|
|
private CellSample cells;
|
|
private String sourceFile;
|
|
private String filename;
|
|
private List<List<String[]>> wells;
|
|
private final Random rand = BiGpairSEQ.getRand();
|
|
private int size;
|
|
private double error;
|
|
private Integer[] populations;
|
|
private double stdDev;
|
|
private double lambda;
|
|
private double zipfExponent;
|
|
private DistributionType distributionType;
|
|
|
|
public Plate(CellSample cells, String cellFilename, int numWells, Integer[] populations,
|
|
double dropoutRate, double parameter){
|
|
this.cells = cells;
|
|
this.sourceFile = cellFilename;
|
|
this.size = numWells;
|
|
this.wells = new ArrayList<>();
|
|
this.error = dropoutRate;
|
|
this.populations = populations;
|
|
this.stdDev = parameter;
|
|
this.lambda = parameter;
|
|
this.zipfExponent = parameter;
|
|
this.distributionType = BiGpairSEQ.getDistributionType();
|
|
fillWells(cells.getCells());
|
|
}
|
|
|
|
|
|
public Plate(int size, double error, Integer[] populations) {
|
|
this.size = size;
|
|
this.error = error;
|
|
this.populations = populations;
|
|
wells = new ArrayList<>();
|
|
}
|
|
|
|
//constructor for returning a Plate from a PlateFileReader
|
|
public Plate(String filename, List<List<String[]>> wells) {
|
|
this.filename = filename;
|
|
this.wells = wells;
|
|
this.size = wells.size();
|
|
|
|
double totalCellCount = 0.0;
|
|
double totalDropoutCount = 0.0;
|
|
List<Integer> concentrations = new ArrayList<>();
|
|
for (List<String[]> w: wells) {
|
|
if(!concentrations.contains(w.size())){
|
|
concentrations.add(w.size());
|
|
}
|
|
for (String[] cell: w) {
|
|
totalCellCount += 1.0;
|
|
for (String sequence: cell) {
|
|
if("-1".equals(sequence)) {
|
|
totalDropoutCount += 1.0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
double totalSequenceCount = totalCellCount * 4;
|
|
this.error = totalDropoutCount / totalSequenceCount;
|
|
this.populations = new Integer[concentrations.size()];
|
|
for (int i = 0; i < this.populations.length; i++) {
|
|
this.populations[i] = concentrations.get(i);
|
|
}
|
|
}
|
|
|
|
private void fillWellsZipf(List<String[]> cells, double exponent) {
|
|
int numSections = populations.length;
|
|
int section = 0;
|
|
int n;
|
|
RejectionInversionZipfSampler zipfSampler = new RejectionInversionZipfSampler(new JDKRandomWrapper(rand), cells.size(), exponent);
|
|
while (section < numSections){
|
|
for (int i = 0; i < (size / numSections); i++) {
|
|
List<String[]> well = new ArrayList<>();
|
|
for (int j = 0; j < populations[section]; j++) {
|
|
do {
|
|
n = zipfSampler.sample();
|
|
} while (n >= cells.size() || n < 0);
|
|
String[] cellToAdd = cells.get(n).clone();
|
|
for(int k = 0; k < cellToAdd.length; k++){
|
|
if(Math.abs(rand.nextDouble()) < error){//error applied to each sequence
|
|
cellToAdd[k] = "-1";
|
|
}
|
|
}
|
|
well.add(cellToAdd);
|
|
}
|
|
wells.add(well);
|
|
}
|
|
section++;
|
|
}
|
|
}
|
|
|
|
private void fillWellsExponential(List<String[]> cells, double lambda){
|
|
int numSections = populations.length;
|
|
int section = 0;
|
|
double m;
|
|
int n;
|
|
while (section < numSections){
|
|
for (int i = 0; i < (size / numSections); i++) {
|
|
List<String[]> well = new ArrayList<>();
|
|
for (int j = 0; j < populations[section]; j++) {
|
|
do {
|
|
//inverse transform sampling: for random number u in [0,1), x = log(1-u) / (-lambda)
|
|
m = (Math.log10((1 - rand.nextDouble()))/(-lambda)) * Math.sqrt(cells.size());
|
|
} while (m >= cells.size() || m < 0);
|
|
n = (int) Math.floor(m);
|
|
String[] cellToAdd = cells.get(n).clone();
|
|
for(int k = 0; k < cellToAdd.length; k++){
|
|
if(Math.abs(rand.nextDouble()) <= error){//error applied to each sequence
|
|
cellToAdd[k] = "-1";
|
|
}
|
|
}
|
|
well.add(cellToAdd);
|
|
}
|
|
wells.add(well);
|
|
}
|
|
section++;
|
|
}
|
|
}
|
|
|
|
private void fillWells( List<String[]> cells, double stdDev) {
|
|
this.stdDev = stdDev;
|
|
int numSections = populations.length;
|
|
int section = 0;
|
|
double m;
|
|
int n;
|
|
while (section < numSections){
|
|
for (int i = 0; i < (size / numSections); i++) {
|
|
List<String[]> well = new ArrayList<>();
|
|
for (int j = 0; j < populations[section]; j++) {
|
|
do {
|
|
m = (rand.nextGaussian() * stdDev) + (cells.size() / 2);
|
|
} while (m >= cells.size() || m < 0);
|
|
n = (int) Math.floor(m);
|
|
String[] cellToAdd = cells.get(n).clone();
|
|
for(int k = 0; k < cellToAdd.length; k++){
|
|
if(Math.abs(rand.nextDouble()) < error){//error applied to each sequence
|
|
cellToAdd[k] = "-1";
|
|
}
|
|
}
|
|
well.add(cellToAdd);
|
|
}
|
|
wells.add(well);
|
|
}
|
|
section++;
|
|
}
|
|
}
|
|
|
|
private void fillWells(List<String[]> cells){
|
|
DistributionType type = BiGpairSEQ.getDistributionType();
|
|
switch (type) {
|
|
case POISSON, GAUSSIAN -> {
|
|
fillWells(cells, getStdDev());
|
|
break;
|
|
}
|
|
case EXPONENTIAL -> {
|
|
fillWellsExponential(cells, getLambda());
|
|
break;
|
|
}
|
|
case ZIPF -> {
|
|
fillWellsZipf(cells, getZipfExponent());
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
public Integer[] getPopulations(){
|
|
return populations;
|
|
}
|
|
|
|
public int getSize(){
|
|
return size;
|
|
}
|
|
|
|
public double getStdDev() {
|
|
return stdDev;
|
|
}
|
|
|
|
public DistributionType getDistributionType() { return distributionType;}
|
|
|
|
public double getLambda(){return lambda;}
|
|
|
|
public double getZipfExponent(){return zipfExponent;}
|
|
|
|
public double getError() {
|
|
return error;
|
|
}
|
|
|
|
public List<List<String[]>> getWells() {
|
|
return wells;
|
|
}
|
|
|
|
//For the sequences at cell indices sIndices, counts number of unique sequences in all wells.
|
|
//Also simulates sequence read errors with given probabilities.
|
|
//Returns a map of SequenceRecords containing plate data for all sequences read.
|
|
//TODO actually implement usage of misreadSequences - DONE
|
|
public Map<String, SequenceRecord> countSequences(Integer readDepth, Double readErrorRate,
|
|
Double errorCollisionRate, Double realSequenceCollisionRate, int... sIndices) {
|
|
SequenceType[] sequenceTypes = EnumSet.allOf(SequenceType.class).toArray(new SequenceType[0]);
|
|
//Map of all real sequences read. Keys are sequences, values are ways sequence has been misread.
|
|
Map<String, List<String>> sequencesAndMisreads = new HashMap<>();
|
|
//Map of all sequences read. Keys are sequences, values are associated SequenceRecords
|
|
Map<String, SequenceRecord> sequenceMap = new LinkedHashMap<>();
|
|
//get list of all distinct, real sequences
|
|
String[] realSequences = assayWells(sIndices).toArray(new String[0]);
|
|
for (int well = 0; well < size; well++) {
|
|
for (String[] cell: wells.get(well)) {
|
|
for (int sIndex: sIndices) {
|
|
//the sequence being read
|
|
String currentSequence = cell[sIndex];
|
|
//skip dropout sequences, which have value -1
|
|
if (!"-1".equals(currentSequence)) {
|
|
//keep rereading the sequence until the read depth is reached
|
|
for (int j = 0; j < readDepth; j++) {
|
|
//The sequence is misread
|
|
if (rand.nextDouble() < readErrorRate) {
|
|
//The sequence hasn't been read or misread before
|
|
if (!sequencesAndMisreads.containsKey(currentSequence)) {
|
|
sequencesAndMisreads.put(currentSequence, new ArrayList<>());
|
|
}
|
|
//The specific misread hasn't happened before
|
|
if (rand.nextDouble() >= errorCollisionRate || sequencesAndMisreads.get(currentSequence).size() == 0) {
|
|
//The misread doesn't collide with a real sequence already on the plate and some sequences have already been read
|
|
if(rand.nextDouble() >= realSequenceCollisionRate || !sequenceMap.isEmpty()){
|
|
StringBuilder spurious = new StringBuilder(currentSequence);
|
|
for (int k = 0; k <= sequencesAndMisreads.get(currentSequence).size(); k++) {
|
|
spurious.append("*");
|
|
}
|
|
//New sequence record for the spurious sequence
|
|
SequenceRecord tmp = new SequenceRecord(spurious.toString(), sequenceTypes[sIndex]);
|
|
tmp.addRead(well);
|
|
sequenceMap.put(spurious.toString(), tmp);
|
|
//add spurious sequence to list of misreads for the real sequence
|
|
sequencesAndMisreads.get(currentSequence).add(spurious.toString());
|
|
}
|
|
//The misread collides with a real sequence already read from plate
|
|
else {
|
|
String wrongSequence;
|
|
do{
|
|
//get a random real sequence that's been read from the plate before
|
|
int index = rand.nextInt(realSequences.length);
|
|
wrongSequence = realSequences[index];
|
|
//make sure it's not accidentally the *right* sequence
|
|
//Also that it's not a wrong sequence already in the misread list
|
|
} while(currentSequence.equals(wrongSequence) || sequencesAndMisreads.get(currentSequence).contains(wrongSequence));
|
|
//update the SequenceRecord for wrongSequence
|
|
sequenceMap.get(wrongSequence).addRead(well);
|
|
//add wrongSequence to the misreads for currentSequence
|
|
sequencesAndMisreads.get(currentSequence).add(wrongSequence);
|
|
}
|
|
}
|
|
}
|
|
//The sequence is read correctly
|
|
else {
|
|
//the sequence hasn't been read before
|
|
if (!sequenceMap.containsKey(currentSequence)) {
|
|
//create new record for the sequence
|
|
SequenceRecord tmp = new SequenceRecord(currentSequence, sequenceTypes[sIndex]);
|
|
//add this read to the sequence record
|
|
tmp.addRead(well);
|
|
//add the sequence and its record to the sequence map
|
|
sequenceMap.put(currentSequence, tmp);
|
|
//add the sequence to the sequences and misreads map
|
|
sequencesAndMisreads.put(currentSequence, new ArrayList<>());
|
|
}
|
|
//the sequence has been read before
|
|
else {
|
|
//get the sequence's record and add this read to it
|
|
sequenceMap.get(currentSequence).addRead(well);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return sequenceMap;
|
|
}
|
|
|
|
private HashSet<String> assayWells(int[] indices) {
|
|
HashSet<String> allSequences = new HashSet<>();
|
|
for (List<String[]> well: wells) {
|
|
for (String[] cell: well) {
|
|
for(int index: indices) {
|
|
allSequences.add(cell[index]);
|
|
}
|
|
}
|
|
}
|
|
return allSequences;
|
|
}
|
|
|
|
public String getSourceFileName() {
|
|
return sourceFile;
|
|
}
|
|
|
|
public String getFilename() { return filename; }
|
|
}
|