2 Commits

4 changed files with 30 additions and 58 deletions

View File

@@ -94,7 +94,7 @@ Options when making a Cell Sample file:
Files are in CSV format. Rows are distinct T cells, columns are sequences within the cells. Files are in CSV format. Rows are distinct T cells, columns are sequences within the cells.
Comments are preceded by `#` Comments are preceded by `#`
Structure example: Structure:
--- ---
# Sample contains 1 unique CDR1 for every 4 unique CDR3s. # Sample contains 1 unique CDR1 for every 4 unique CDR3s.
@@ -136,20 +136,20 @@ Every column represents an individual cell, containing four sequences, represent
Notice that the Alpha CDR1 is missing in the cell above, due to sequence dropout. Notice that the Alpha CDR1 is missing in the cell above, due to sequence dropout.
Dropouts are represented by replacing sequences with the value `-1`. Comments are preceded by `#` Dropouts are represented by replacing sequences with the value `-1`. Comments are preceded by `#`
Structure Example: Structure:
--- ---
``` ```
# Cell source file name: 4MilCells.csv # Cell source file name:
# Plate size: 96 # Each row represents one well on the plate.
# Error rate: 0.1 # Plate size:
# Concentrations: 10000 5000 500 # Error rate:
# Lambda: 0.6 # Concentrations:
# Lambda:
``` ```
| well 1 | well 2 | well 3| ... | | Well 1, cell 1 | Well 1, cell 2 | Well 1, cell 3| ... |
|---|---|---|---| |---|---|---|---|
| [105383, 786528, 959247, 925928] | [525902, 791533, -1, 866282] | [409236, 132303, 804465, 942261]| ... | | **Well 2, cell 1** | **Well 2, cell 2** | **Well 2, cell 3**| ... |
| [249930, 301502, 970003, 881099] | [523787, 552952, 997194, 970507]| [425363, 417411, 845399, -1]| ... | | **Well 3, cell 1** | **Well 3, cell 2** | **Well 3, cell 3**| ... |
| ... | ... | ... | ... | | ... | ... | ... | ... |
--- ---
@@ -222,10 +222,9 @@ using the (2021 corrected) formula from the original pairSEQ paper. (Howie, et a
## TODO ## TODO
* ~~Try invoking GC at end of workloads to reduce paging to disk~~ DONE * Try invoking GC at end of workloads to reduce paging to disk
* ~~Hold graph data in memory until another graph is read-in?~~ * ~~Hold graph data in memory until another graph is read-in?~~
* No, this won't work, because BiGpairSEQ simulations alter the underlying graph based on filtering constraints. Changes would cascade with multiple experiments. * No, this won't work, because BiGpairSEQ simulations alter the underlying graph based on filtering constraints. Changes would cascade with multiple experiments.
* ~~See if there's a reasonable way to reformat Sample Plate files so that wells are columns instead of rows~~ DONE
* Enable GraphML output in addition to serialized object binaries, for data portability * Enable GraphML output in addition to serialized object binaries, for data portability
* Custom vertex type with attribute for sequence occupancy? * Custom vertex type with attribute for sequence occupancy?
* Re-implement CDR1 matching method * Re-implement CDR1 matching method
@@ -238,7 +237,10 @@ using the (2021 corrected) formula from the original pairSEQ paper. (Howie, et a
* Implement sample plates with random numbers of T cells per well * Implement sample plates with random numbers of T cells per well
* Possible BiGpairSEQ advantage over pairSEQ: BiGpairSEQ is resilient to variations in well populations; pairSEQ is not. * Possible BiGpairSEQ advantage over pairSEQ: BiGpairSEQ is resilient to variations in well populations; pairSEQ is not.
* Preliminary data suggests that BiGpairSEQ behaves roughly as though the whole plate had whatever the *average* well concentration is, but that's still speculative. * Preliminary data suggests that BiGpairSEQ behaves roughly as though the whole plate had whatever the *average* well concentration is, but that's still speculative.
* See if there's a reasonable way to reformat Sample Plate files so that wells are columns instead of rows
* The problem is that the number of cells varies from well to well
* The Apache Commons CSV library writes entries one row at a time
* Could possibly sort the wells by number of cells first, then construct the entries
## CITATIONS ## CITATIONS
* Howie, B., Sherwood, A. M., et al. ["High-throughput pairing of T cell receptor alpha and beta sequences."](https://pubmed.ncbi.nlm.nih.gov/26290413/) Sci. Transl. Med. 7, 301ra131 (2015) * Howie, B., Sherwood, A. M., et al. ["High-throughput pairing of T cell receptor alpha and beta sequences."](https://pubmed.ncbi.nlm.nih.gov/26290413/) Sci. Transl. Med. 7, 301ra131 (2015)

View File

@@ -31,54 +31,23 @@ public class PlateFileReader {
BufferedReader reader = Files.newBufferedReader(Path.of(filename)); BufferedReader reader = Files.newBufferedReader(Path.of(filename));
CSVParser parser = new CSVParser(reader, plateFileFormat); CSVParser parser = new CSVParser(reader, plateFileFormat);
){ ){
//old code for wells as rows
// for(CSVRecord record: parser.getRecords()) {
// List<Integer[]> well = new ArrayList<>();
// for(String s: record) {
// if(!"".equals(s)) {
// String[] intString = s.replaceAll("\\[", "")
// .replaceAll("]", "")
// .replaceAll(" ", "")
// .split(",");
// //System.out.println(intString);
// Integer[] arr = new Integer[intString.length];
// for (int i = 0; i < intString.length; i++) {
// arr[i] = Integer.valueOf(intString[i]);
// }
// well.add(arr);
// }
// }
// wells.add(well);
for(CSVRecord record: parser.getRecords()) { for(CSVRecord record: parser.getRecords()) {
if (wells.size() == 0) { List<Integer[]> well = new ArrayList<>();
int num = 0;
for(String s: record) { for(String s: record) {
num++; if(!"".equals(s)) {
} String[] intString = s.replaceAll("\\[", "")
for (int i = 0; i < num; i++) {
wells.add(new ArrayList<>());
}
} else {
int i = 0;
for (String s : record) {
if (!"".equals(s)) { //if value isn't the empty string
//get rid of brackets, split at commas into a string array
String[] intsAsStrings = s.replaceAll("\\[", "")
.replaceAll("]", "") .replaceAll("]", "")
.replaceAll(" ", "") .replaceAll(" ", "")
.split(","); .split(",");
//Make Integer array with the same values //System.out.println(intString);
Integer[] arr = new Integer[intsAsStrings.length]; Integer[] arr = new Integer[intString.length];
for (int j = 0; j < intsAsStrings.length; j++) { for (int i = 0; i < intString.length; i++) {
arr[j] = Integer.valueOf(intsAsStrings[j]); arr[i] = Integer.valueOf(intString[i]);
} }
//Add Integer array to the correct well well.add(arr);
wells.get(i).add(arr);
i++;
} }
} }
wells.add(well);
}
} }
} catch(IOException ex){ } catch(IOException ex){
System.out.println("plate file " + filename + " not found."); System.out.println("plate file " + filename + " not found.");

View File

@@ -59,7 +59,7 @@ public class PlateFileWriter {
} }
} }
//this took forever, and I don't use it, because it makes reading data in a huge pain //this took forever
List<List<String>> rows = new ArrayList<>(); List<List<String>> rows = new ArrayList<>();
List<String> tmp = new ArrayList<>(); List<String> tmp = new ArrayList<>();
for(int i = 0; i < wellsAsStrings.size(); i++){//List<Integer[]> w: wells){ for(int i = 0; i < wellsAsStrings.size(); i++){//List<Integer[]> w: wells){
@@ -89,6 +89,7 @@ public class PlateFileWriter {
CSVPrinter printer = new CSVPrinter(writer, plateFileFormat); CSVPrinter printer = new CSVPrinter(writer, plateFileFormat);
){ ){
printer.printComment("Cell source file name: " + sourceFileName); printer.printComment("Cell source file name: " + sourceFileName);
printer.printComment("Each row represents one well on the plate.");
printer.printComment("Plate size: " + size); printer.printComment("Plate size: " + size);
printer.printComment("Error rate: " + error); printer.printComment("Error rate: " + error);
printer.printComment("Concentrations: " + concenString); printer.printComment("Concentrations: " + concenString);