I'm working on digital SSR of perennial plant whole chromatin DNA. I found the Interval type very useful in titrating out a chromosome segment containing a reasonable density of alleles of interest. The code ran remarkably fast and had no surprises. One thing I wished for was a built-in IntervalLength function, but perhaps this exists by another name in Geometry or elsewhere.
In the following example, a very long read chromosome assembly is downloaded from GenBank. This is followed by two tunable parameters "allelePositionsSpan" and "maxInitialSeqLen", and then by the alleles of interest.
fStart = AbsoluteTime[];
genbankData = ResourceFunction["ImportFASTA"]["CM019739.1"];
sequenceDescription = genbankData[[1, 1]];
forwardSequence = genbankData[[2, 1]];
fFinish = AbsoluteTime[];
Print[sequenceDescription];
Print["StringLength[forwardSequence] = ",
StringLength[forwardSequence]];
Print["elapsed download time = ", SetPrecision[(fFinish - fStart), 3],
" seconds"];
allelePositionsSpan = 16;
maxInitialSeqLen = 580;
Print["allelePositionsSpan = ", allelePositionsSpan];
Print["maxInitialSeqLen = ", maxInitialSeqLen];
allelesOfInterest = {"GTTT", "TAAT", "TGTT", "TTAA", "TTTA"};
allelesOfInterestCount = Length[allelesOfInterest];
Print["allelesOfInterest = ", allelesOfInterest];
iSstart = AbsoluteTime[];
initialSelections = Table[
Table[
If[#[[p + allelePositionsSpan, 2]] - #[[p, 1]] >
maxInitialSeqLen,
Nothing,
Interval[{#[[p, 1]], #[[p + allelePositionsSpan, 2]]}]],
{p, 1, Length[#] - allelePositionsSpan}
] &[SortBy[StringPosition[forwardSequence, allele], First]],
{allele, allelesOfInterest}
];
iSfinish = AbsoluteTime[];
Print["initialSelections dimensions = ", {allelesOfInterestCount,
Length[#] & /@ initialSelections}];
Print["initialSelections elapsed time = ",
SetPrecision[(iSfinish - iSstart), 3], " seconds"];
aHstart = AbsoluteTime[];
alleleHashSets =
Table[CreateDataStructure["HashSet"], {allele, allelesOfInterest}];
Do[
imax = Length[initialSelections[[h]]];
i = 1;
While[(i < imax),
mergedInterval = initialSelections[[h, i]];
notFound = True;
While[notFound \[And] (i < imax),
i++;
intersection =
IntervalIntersection[mergedInterval, initialSelections[[h, i]]];
If[Length[intersection] > 0,
mergedInterval =
IntervalUnion[mergedInterval, initialSelections[[h, i]]],
(* else *)
notFound = False;
];
{} (* end While body *)
];
alleleHashSets[[h]]["Insert", mergedInterval];
{} (* end While body *)
];
{},
{h, 1, allelesOfInterestCount}
];
Clear[initialSelections];
aHfinish = AbsoluteTime[];
Print["alleleHashSets dimensions = ", {allelesOfInterestCount,
Length[Normal[#]] & /@ alleleHashSets}];
Print["alleleHashSets elapsed time = ",
SetPrecision[(aHfinish - aHstart), 3], " seconds"];
vIstart = AbsoluteTime[];
vettedIntervals = Normal[alleleHashSets[[1]]]; (* initializer *)
Do[
vettedIntervalsHash = CreateDataStructure["HashSet"];
Do[
Do[
If[Length[IntervalIntersection[vI, aI]] > 0,
vettedIntervalsHash["Insert", IntervalUnion[vI, aI]]
],
{vI, vettedIntervals}
],
{aI, Normal[alleleHashSets[[a]]]}
];
vettedIntervals = Normal[vettedIntervalsHash];
Clear[vettedIntervalsHash];
{},
{a, 2, allelesOfInterestCount }
];
Clear[alleleHashSets];
vIfinish = AbsoluteTime[];
vettedIntervalsLength = Length[vettedIntervals];
Print["vettedIntervalsLength = ", vettedIntervalsLength];
Print["vettedIntervals elapsed time = ",
SetPrecision[(vIfinish - vIstart), 3], " seconds"];
If[vettedIntervalsLength > 0,
iMax = Min[vettedIntervalsLength, 5];
Print["vettedIntervals (", vettedIntervalsLength, "):"];
Do[
Print[i, " : ", vettedIntervals[[i]], ", length ",
vettedIntervals[[i, 1, 2]] - vettedIntervals[[i, 1, 1]]];
Print["\t",
"allele counts = ", (Length[
StringPosition[
StringTake[forwardSequence, vettedIntervals[[i, 1]]], #]] & /@
allelesOfInterest)],
{i, 1, iMax}
];
If[vettedIntervalsLength > iMax, Print["..."]];
];
Output:
allelePositionsSpan = 16
maxInitialSeqLen = 580
allelesOfInterest = {GTTT,TAAT,TGTT,TTAA,TTTA}
initialSelections dimensions = {5,{1159,4162,1186,6408,14353}}
initialSelections elapsed time = 3.52 seconds
alleleHashSets dimensions = {5,{177,648,168,975,1787}}
alleleHashSets elapsed time = 0.107 seconds
vettedIntervalsLength = 1
vettedIntervals elapsed time = 0.109 seconds
vettedIntervals (1):
1 : Interval[{13255103,13257036}], length 1933
allele counts = {32,27,30,25,47}