1 module dparasail.cigar; 2 3 /** 4 This module simplifies working with CIGAR strings/ops from Parasail alignment records. 5 Copied from dhtslib. 6 */ 7 8 import std.stdio; 9 import std.bitmanip : bitfields; 10 import std.array : join; 11 import std.algorithm : map; 12 import std.algorithm.iteration : each; 13 import std.conv : to; 14 import std.range : array; 15 import std.traits : isIntegral; 16 17 import dparasail.log; 18 19 /// Represents a CIGAR string 20 /// https://samtools.github.io/hts-specs/SAMv1.pdf §1.4.6 21 struct Cigar 22 { 23 /// array of distinct CIGAR ops 24 /// private now to force fixing some reference issues 25 private CigarOp[] ops; 26 27 /// Construct Cigar from raw data 28 this(uint* cigar, uint length) 29 { 30 this((cast(CigarOp*) cigar)[0 .. length]); 31 } 32 33 /// Construct Cigar from an array of CIGAR ops 34 this(CigarOp[] ops) 35 { 36 this.ops = ops; 37 } 38 39 /// null CIGAR -- don't read! 40 bool is_null() 41 { 42 return this.ops.length == 0; 43 } 44 45 /// Format Cigar struct as CIGAR string in accordance with SAM spec 46 string toString() 47 { 48 return ops.map!(x => x.length.to!string ~ CIGAR_STR[x.op]).array.join; 49 } 50 51 /// get length of cigar 52 auto length(){ 53 return this.ops.length; 54 } 55 56 /// set length of cigar 57 void length(T)(T len) 58 if(isIntegral!T) 59 { 60 this.ops.length = len; 61 } 62 63 /// copy cigar 64 auto const dup(){ 65 return Cigar(this.ops.dup); 66 } 67 68 /// return the alignment length expressed by this Cigar 69 @property int refBasesCovered() 70 { 71 int len; 72 foreach (op; this.ops) 73 { 74 // look ma no branching (ignore the above foreach) 75 // add op.length if reference consuming 76 // mask is 0 if not reference consuming 77 // mask is -1 if reference consuming 78 len += op.length & (uint.init - (((CIGAR_TYPE >> ((op.op & 0xF) << 1)) & 2) >> 1)); 79 } 80 return len; 81 } 82 83 deprecated("Use camelCase names instead") 84 alias ref_bases_covered = refBasesCovered; 85 /// previous alignedLength function had a bug and 86 /// it is just a duplicate of ref_bases_covered 87 alias alignedLength = refBasesCovered; 88 89 /// allow foreach on Cigar 90 int opApply(int delegate(ref CigarOp) dg) { 91 int result = 0; 92 93 foreach (op; ops) { 94 result = dg(op); 95 96 if (result) { 97 break; 98 } 99 } 100 101 return result; 102 } 103 104 /// Get a Slice of cigar ops from a range in the Cigar string 105 auto opSlice() 106 { 107 return ops; 108 } 109 110 /// Get a Slice of cigar ops from a range in the Cigar string 111 auto opSlice(T)(T start, T end) if (isIntegral!T) 112 { 113 return ops[start .. end]; 114 } 115 116 /// Get a cigar op from a single position in the Cigar string 117 ref auto opIndex(ulong i) 118 { 119 return ops[i]; 120 } 121 122 /// Assign a cigar op at a single position in the Cigar string 123 auto opIndexAssign(CigarOp value, size_t index) 124 { 125 return ops[index] = value; 126 } 127 128 /// Assign a range cigar ops over a range in the Cigar string 129 auto opSliceAssign(T)(CigarOp[] values, T start, T end) 130 { 131 assert(end - start == values.length); 132 return ops[start .. end] = values; 133 } 134 135 auto opAssign(T: CigarOp[])(T value) 136 { 137 this.ops = value; 138 return this; 139 } 140 141 auto opAssign(T: Cigar)(T value) 142 { 143 this.ops = value.ops; 144 return this; 145 } 146 147 size_t opDollar() 148 { 149 return length; 150 } 151 152 auto opBinary(string op)(const Cigar rhs) const 153 if(op == "~") 154 { 155 return Cigar(this.dup[] ~ rhs.dup[]); 156 } 157 } 158 159 // Each pair of bits has first bit set iff the operation is query consuming, 160 // and second bit set iff it is reference consuming. 161 // X = P H S N D I M 162 private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11; 163 164 /// Represents a distinct cigar operation 165 union CigarOp 166 { 167 /// raw opcode 168 uint raw; 169 170 mixin(bitfields!( //lower 4 bits store op 171 Ops, "op", 4, //higher 28 bits store length 172 uint, "length", 28)); 173 174 /// construct Op from raw opcode 175 this(uint raw) 176 { 177 this.raw = raw; 178 } 179 180 /// construct Op from an operator and operand (length) 181 this(uint len, Ops op) 182 { 183 this.op = op; 184 this.length = len; 185 } 186 /// Credit to Biod for this code below 187 /// https://github.com/biod/BioD from their bam.cigar module 188 /// True iff operation is one of M, =, X, I, S 189 bool isQueryConsuming() @property const nothrow @nogc 190 { 191 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0; 192 } 193 194 /// True iff operation is one of M, =, X, D, N 195 bool isReferenceConsuming() @property const nothrow @nogc 196 { 197 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0; 198 } 199 200 /// True iff operation is one of M, =, X 201 bool isMatchOrMismatch() @property const nothrow @nogc 202 { 203 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3; 204 } 205 206 /// True iff operation is one of 'S', 'H' 207 bool isClipping() @property const nothrow @nogc 208 { 209 return ((raw & 0xF) >> 1) == 2; // 4 or 5 210 } 211 212 deprecated("Use camelCase names instead") 213 alias is_query_consuming = isQueryConsuming; 214 deprecated("Use camelCase names instead") 215 alias is_reference_consuming = isReferenceConsuming; 216 deprecated("Use camelCase names instead") 217 alias is_match_or_mismatch = isMatchOrMismatch; 218 deprecated("Use camelCase names instead") 219 alias is_clipping = isClipping; 220 221 string toString(){ 222 return this.length.to!string ~ CIGAR_STR[this.op]; 223 } 224 } 225 226 /** 227 Represents all ops 228 #define BAM_CIGAR_STR "MIDNSHP=XB" 229 #define BAM_CMATCH 0 230 #define BAM_CINS 1 231 #define BAM_CDEL 2 232 #define BAM_CREF_SKIP 3 233 #define BAM_CSOFT_CLIP 4 234 #define BAM_CHARD_CLIP 5 235 #define BAM_CPAD 6 236 #define BAM_CEQUAL 7 237 #define BAM_CDIFF 8 238 #define BAM_CBACK 9 239 */ 240 string CIGAR_STR = "MIDNSHP=XB"; 241 /// ditto 242 enum Ops 243 { 244 MATCH = 0, 245 INS = 1, 246 DEL = 2, 247 REF_SKIP = 3, 248 SOFT_CLIP = 4, 249 HARD_CLIP = 5, 250 PAD = 6, 251 EQUAL = 7, 252 DIFF = 8, 253 BACK = 9 254 } 255 256 /// return Cigar struct for a given CIGAR string (e.g. from SAM line) 257 Cigar cigarFromString(string cigar) 258 { 259 import std.regex; 260 261 return Cigar(match(cigar, regex(`(\d+)([A-Z=])`, "g")).map!(m => CigarOp(m[1].to!uint, 262 m[2].to!char.charToOp)).array); 263 } 264 265 /// Convert single char representing a CIGAR Op to Op union 266 Ops charToOp(char c) 267 { 268 foreach (i, o; CIGAR_STR) 269 { 270 if (c == o) 271 { 272 return cast(Ops) i; 273 } 274 } 275 return cast(Ops) 9; 276 } 277 278 debug (dhtslib_unittest) unittest 279 { 280 writeln(); 281 logInfo(__FUNCTION__, "Testing is_query_consuming and is_reference_consuming"); 282 string c = "130M2D40M"; 283 auto cig = cigarFromString(c); 284 logInfo(__FUNCTION__, "Cigar:" ~ cig.toString()); 285 assert(cig.toString() == c); 286 assert(cig.ops[0].is_query_consuming && cig.ops[0].is_reference_consuming); 287 assert(!cig.ops[1].is_query_consuming && cig.ops[1].is_reference_consuming); 288 } 289 290 /// Range-based iteration of a Cigar string 291 /// Returns a range of Ops that is the length of all op lengths 292 /// e.g. if the Cigar is 4M5D2I4M 293 /// CigarItr will return a range of MMMMDDDDDIIMMMM 294 /// Range is of the Ops enum not chars 295 struct CigarItr 296 { 297 Cigar cigar; 298 CigarOp current; 299 300 this(Cigar c) 301 { 302 // Copy the cigar 303 cigar = c.dup; 304 current = cigar[0]; 305 current.length = current.length - 1; 306 } 307 308 Ops front() 309 { 310 return current.op; 311 } 312 313 void popFront() 314 { 315 // import std.stdio; 316 // writeln(current); 317 // writeln(cigar); 318 if(cigar.length == 1 && current.length == 0) 319 cigar.length = 0; 320 else if (current.length == 0) 321 { 322 cigar = cigar[1 .. $]; 323 current = cigar[0]; 324 current.length = current.length - 1; 325 } 326 else if (current.length != 0) 327 current.length = current.length - 1; 328 } 329 330 bool empty() 331 { 332 return cigar.length == 0; 333 } 334 } 335 336 debug (dhtslib_unittest) unittest 337 { 338 import std.algorithm : map; 339 340 writeln(); 341 logInfo(__FUNCTION__, "Testing CigarItr"); 342 string c = "7M2D4M"; 343 auto cig = cigarFromString(c); 344 logInfo(__FUNCTION__, "Cigar:" ~ cig.toString()); 345 auto itr = CigarItr(cig); 346 assert(itr.map!(x => CIGAR_STR[x]).array.idup == "MMMMMMMDDMMMM"); 347 } 348 349 /// Coordinate pair representing aligned query and reference positions, 350 /// and the CIGAR op at or effecting their alignment. 351 struct AlignedCoordinate 352 { 353 long qpos, rpos; 354 Ops cigar_op; 355 } 356 357 /// Iterator yielding all AlignedCoordinate pairs for a given CIGAR string 358 struct AlignedCoordinatesItr 359 { 360 CigarItr itr; 361 AlignedCoordinate current; 362 363 this(Cigar cigar) 364 { 365 itr = CigarItr(cigar); 366 current.qpos = current.rpos = -1; 367 current.cigar_op = itr.front; 368 current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1); 369 current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1); 370 } 371 372 /// InputRange interface 373 AlignedCoordinate front() 374 { 375 return current; 376 } 377 378 /// InputRange interface 379 void popFront() 380 { 381 itr.popFront; 382 current.cigar_op = itr.front; 383 current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1); 384 current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1); 385 } 386 387 /// InputRange interface 388 bool empty() 389 { 390 return itr.empty; 391 } 392 }