1 module dparasail.cigar;
2 
3 /**
4 This module simplifies working with CIGAR strings/ops from Parasail alignment records.
5 Copied from dhtslib.
6 */
7 
8 import std.stdio;
9 import std.bitmanip : bitfields;
10 import std.array : join;
11 import std.algorithm : map;
12 import std.algorithm.iteration : each;
13 import std.conv : to;
14 import std.range : array;
15 import std.traits : isIntegral;
16 
17 import dparasail.log;
18 
19 /// Represents a CIGAR string
20 /// https://samtools.github.io/hts-specs/SAMv1.pdf §1.4.6
21 struct Cigar
22 {
23     /// array of distinct CIGAR ops 
24     /// private now to force fixing some reference issues
25     private CigarOp[] ops;
26 
27     /// Construct Cigar from raw data
28     this(uint* cigar, uint length)
29     {
30         this((cast(CigarOp*) cigar)[0 .. length]);
31     }
32 
33     /// Construct Cigar from an array of CIGAR ops
34     this(CigarOp[] ops)
35     {
36         this.ops = ops;
37     }
38 
39     /// null CIGAR -- don't read!
40     bool is_null()
41     {
42         return this.ops.length == 0;
43     }
44 
45     /// Format Cigar struct as CIGAR string in accordance with SAM spec
46     string toString()
47     {
48         return ops.map!(x => x.length.to!string ~ CIGAR_STR[x.op]).array.join;
49     }
50 
51     /// get length of cigar
52     auto length(){
53         return this.ops.length;
54     }
55     
56     /// set length of cigar
57     void length(T)(T len)
58     if(isIntegral!T)
59     {
60         this.ops.length = len;
61     }
62 
63     /// copy cigar
64     auto const dup(){
65         return Cigar(this.ops.dup);
66     }
67 
68     /// return the alignment length expressed by this Cigar
69     @property int refBasesCovered()
70     {
71         int len;
72         foreach (op; this.ops)
73         {
74             // look ma no branching (ignore the above foreach)
75             // add op.length if reference consuming
76             // mask is 0 if not reference consuming
77             // mask is -1 if reference consuming
78             len += op.length & (uint.init - (((CIGAR_TYPE >> ((op.op & 0xF) << 1)) & 2) >> 1));
79         }
80         return len;
81     }
82 
83     deprecated("Use camelCase names instead")
84     alias ref_bases_covered = refBasesCovered;
85     /// previous alignedLength function had a bug and 
86     /// it is just a duplicate of ref_bases_covered
87     alias alignedLength = refBasesCovered;
88 
89     /// allow foreach on Cigar
90     int opApply(int delegate(ref CigarOp) dg) {
91         int result = 0;
92 
93         foreach (op; ops) {
94             result = dg(op);
95 
96             if (result) {
97                 break;
98             }
99         }
100 
101         return result;
102     }
103     
104     /// Get a Slice of cigar ops from a range in the Cigar string
105     auto opSlice()
106     {
107         return ops;
108     }
109 
110     /// Get a Slice of cigar ops from a range in the Cigar string
111     auto opSlice(T)(T start, T end) if (isIntegral!T)
112     {
113         return ops[start .. end];
114     }
115 
116     /// Get a cigar op from a single position in the Cigar string
117     ref auto opIndex(ulong i)
118     {
119         return ops[i];
120     }
121 
122     /// Assign a cigar op at a single position in the Cigar string
123     auto opIndexAssign(CigarOp value, size_t index)
124     {
125         return ops[index] = value;
126     }
127 
128     /// Assign a range cigar ops over a range in the Cigar string
129     auto opSliceAssign(T)(CigarOp[] values, T start, T end)
130     {
131         assert(end - start == values.length);
132         return ops[start .. end] = values;
133     }
134 
135     auto opAssign(T: CigarOp[])(T value)
136     {
137         this.ops = value;
138         return this;
139     }
140 
141     auto opAssign(T: Cigar)(T value)
142     {
143         this.ops = value.ops;
144         return this;
145     }
146 
147     size_t opDollar()
148     {
149         return length;
150     }
151 
152     auto opBinary(string op)(const Cigar rhs) const
153     if(op == "~")
154     {
155         return Cigar(this.dup[] ~ rhs.dup[]);
156     }
157 }
158 
159 // Each pair of bits has first bit set iff the operation is query consuming,
160 // and second bit set iff it is reference consuming.
161 //                                            X  =  P  H  S  N  D  I  M
162 private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;
163 
164 /// Represents a distinct cigar operation
165 union CigarOp
166 {
167     /// raw opcode
168     uint raw;
169 
170     mixin(bitfields!( //lower 4 bits store op
171             Ops, "op", 4, //higher 28 bits store length
172             uint, "length", 28));
173 
174     /// construct Op from raw opcode
175     this(uint raw)
176     {
177         this.raw = raw;
178     }
179 
180     /// construct Op from an operator and operand (length)
181     this(uint len, Ops op)
182     {
183         this.op = op;
184         this.length = len;
185     }
186     /// Credit to Biod for this code below
187     /// https://github.com/biod/BioD from their bam.cigar module
188     /// True iff operation is one of M, =, X, I, S
189     bool isQueryConsuming() @property const nothrow @nogc
190     {
191         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0;
192     }
193 
194     /// True iff operation is one of M, =, X, D, N
195     bool isReferenceConsuming() @property const nothrow @nogc
196     {
197         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0;
198     }
199 
200     /// True iff operation is one of M, =, X
201     bool isMatchOrMismatch() @property const nothrow @nogc
202     {
203         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3;
204     }
205 
206     /// True iff operation is one of 'S', 'H'
207     bool isClipping() @property const nothrow @nogc
208     {
209         return ((raw & 0xF) >> 1) == 2; // 4 or 5
210     }
211 
212     deprecated("Use camelCase names instead")
213     alias is_query_consuming = isQueryConsuming;
214     deprecated("Use camelCase names instead")
215     alias is_reference_consuming = isReferenceConsuming;
216     deprecated("Use camelCase names instead")
217     alias is_match_or_mismatch = isMatchOrMismatch;
218     deprecated("Use camelCase names instead")
219     alias is_clipping = isClipping;
220 
221     string toString(){
222         return this.length.to!string ~ CIGAR_STR[this.op];
223     }
224 }
225 
226 /**
227 Represents all ops
228 #define BAM_CIGAR_STR   "MIDNSHP=XB"
229 #define BAM_CMATCH      0
230 #define BAM_CINS        1
231 #define BAM_CDEL        2
232 #define BAM_CREF_SKIP   3
233 #define BAM_CSOFT_CLIP  4
234 #define BAM_CHARD_CLIP  5
235 #define BAM_CPAD        6
236 #define BAM_CEQUAL      7
237 #define BAM_CDIFF       8
238 #define BAM_CBACK       9
239 */
240 string CIGAR_STR = "MIDNSHP=XB";
241 /// ditto
242 enum Ops
243 {
244     MATCH = 0,
245     INS = 1,
246     DEL = 2,
247     REF_SKIP = 3,
248     SOFT_CLIP = 4,
249     HARD_CLIP = 5,
250     PAD = 6,
251     EQUAL = 7,
252     DIFF = 8,
253     BACK = 9
254 }
255 
256 /// return Cigar struct for a given CIGAR string (e.g. from SAM line)
257 Cigar cigarFromString(string cigar)
258 {
259     import std.regex;
260 
261     return Cigar(match(cigar, regex(`(\d+)([A-Z=])`, "g")).map!(m => CigarOp(m[1].to!uint,
262             m[2].to!char.charToOp)).array);
263 }
264 
265 /// Convert single char representing a CIGAR Op to Op union
266 Ops charToOp(char c)
267 {
268     foreach (i, o; CIGAR_STR)
269     {
270         if (c == o)
271         {
272             return cast(Ops) i;
273         }
274     }
275     return cast(Ops) 9;
276 }
277 
278 debug (dhtslib_unittest) unittest
279 {
280     writeln();
281     logInfo(__FUNCTION__, "Testing is_query_consuming and is_reference_consuming");
282     string c = "130M2D40M";
283     auto cig = cigarFromString(c);
284     logInfo(__FUNCTION__, "Cigar:" ~ cig.toString());
285     assert(cig.toString() == c);
286     assert(cig.ops[0].is_query_consuming && cig.ops[0].is_reference_consuming);
287     assert(!cig.ops[1].is_query_consuming && cig.ops[1].is_reference_consuming);
288 }
289 
290 /// Range-based iteration of a Cigar string
291 /// Returns a range of Ops that is the length of all op lengths
292 /// e.g. if the Cigar is 4M5D2I4M
293 /// CigarItr will return a range of MMMMDDDDDIIMMMM
294 /// Range is of the Ops enum not chars
295 struct CigarItr
296 {
297     Cigar cigar;
298     CigarOp current;
299 
300     this(Cigar c)
301     {
302         // Copy the cigar
303         cigar = c.dup;
304         current = cigar[0];
305         current.length = current.length - 1;
306     }
307 
308     Ops front()
309     {
310         return current.op;
311     }
312 
313     void popFront()
314     {
315         // import std.stdio;
316         // writeln(current);
317         // writeln(cigar);
318         if(cigar.length == 1 && current.length == 0)
319             cigar.length = 0;
320         else if (current.length == 0)
321         {
322             cigar = cigar[1 .. $];
323             current = cigar[0];
324             current.length = current.length - 1;
325         }
326         else if (current.length != 0)
327             current.length = current.length - 1;
328     }
329 
330     bool empty()
331     {
332         return cigar.length == 0;
333     }
334 }
335 
336 debug (dhtslib_unittest) unittest
337 {
338     import std.algorithm : map;
339 
340     writeln();
341     logInfo(__FUNCTION__, "Testing CigarItr");
342     string c = "7M2D4M";
343     auto cig = cigarFromString(c);
344     logInfo(__FUNCTION__, "Cigar:" ~ cig.toString());
345     auto itr = CigarItr(cig);
346     assert(itr.map!(x => CIGAR_STR[x]).array.idup == "MMMMMMMDDMMMM");
347 }
348 
349 /// Coordinate pair representing aligned query and reference positions,
350 /// and the CIGAR op at or effecting their alignment.
351 struct AlignedCoordinate
352 {
353     long qpos, rpos;
354     Ops cigar_op;
355 }
356 
357 /// Iterator yielding all AlignedCoordinate pairs for a given CIGAR string
358 struct AlignedCoordinatesItr
359 {
360     CigarItr itr;
361     AlignedCoordinate current;
362 
363     this(Cigar cigar)
364     {
365         itr = CigarItr(cigar);
366         current.qpos = current.rpos = -1;
367         current.cigar_op = itr.front;
368         current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1);
369         current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1);
370     }
371 
372     /// InputRange interface
373     AlignedCoordinate front()
374     {
375         return current;
376     }
377 
378     /// InputRange interface
379     void popFront()
380     {
381         itr.popFront;
382         current.cigar_op = itr.front;
383         current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1);
384         current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1);
385     }
386 
387     /// InputRange interface
388     bool empty()
389     {
390         return itr.empty;
391     }
392 }