Source code for bioa.io.sam

# -*- coding: utf-8 -*-
"""
================================
Grab read data from SAM/BAM file
================================
"""

__all__ = ["reads_from_sam",
           "reads_from_sam_quick"]

__author__ = "\n".join(["Nicholas Mancuso (nick.mancuso@gmail.com)",
                       "Alex Artyomenko (artyomenkoav@gmail.com)"])
#   Copyright (C) 2011-2012 by
#   Nicholas Mancuso <nick.mancuso@gmail.com>
#   Alex Artyomenko <artyomenkoav@gmail.com>
#   All rights reserved.
#   BSD license.

import sys
from collections import defaultdict

MATCH = 0
INSERTION = 1
DELETION = 2
CLIPPING = 4

INSERT_CHAR = "-"
DELETE_CHAR = "-"


[docs]def reads_from_sam_quick(sam_file): """ Quickly parse aligned reads from SAM/BAM file. This only returns reads and does not attempt to pad. Parameters ---------- sam_file : pysam.SamFile The SAM/BAM file object. Returns ------- read_pos_iter : (str, int) iterable Iterable of read-pairs indicating the read and its starting position in the reference. """ for aligned_read in sam_file: if aligned_read.is_unmapped: continue yield (str(aligned_read.seq), aligned_read.pos) return
[docs]def reads_from_sam(sam_file, reference): """ Parse aligned reads from SAM/BAM file. Parameters ---------- sam_file : pysam.SamFile The SAM/BAM file object. reference : Bio.Seq The reference sequence used for alignment. Returns ------- align_start, align_stop, reads, ref : (int, int, list, Bio.Seq) The alignment start, alignment stop, list of reads (str, int), and extended reference. Each (str, int) pair represents the read and its starting position. """ from Bio.Seq import Seq align_start = sys.maxint ref_inserts = defaultdict(int) for aligned_read in sam_file: # skip unmapped reads if aligned_read.is_unmapped: continue insertions = _calculate_insertions(aligned_read) for pos in insertions: ref_inserts[pos] = max(ref_inserts[pos], insertions[pos]) # extend the reference new_ref = [] for idx, nucleotide in enumerate(reference): new_ref.append(nucleotide) if idx in ref_inserts: new_ref.append(INSERT_CHAR * ref_inserts[idx]) sam_file.reset() cache = defaultdict(int) align_start, align_stop, reads = _adjust_all_reads(sam_file, ref_inserts, cache) sam_file.close() return align_start, align_stop, reads, Seq("".join(new_ref))
def _adjust_all_reads(sam_file, ref_inserts, cache): """ Count the offset to give aligned start position. Adjust read and output it to ira format file. """ reads = [] align_start, align_stop = sys.maxint, 0 for aligned_read in sam_file: # skip unmapped reads if aligned_read.is_unmapped: continue offset = _cached_pos_offset(cache, ref_inserts, aligned_read.pos) adj_read = _adjust_read(aligned_read, ref_inserts) start = aligned_read.pos + offset + 1 align_start = min(align_start, start) align_stop = max(align_stop, start + len(adj_read)) reads.append((adj_read, start)) return align_start, align_stop, reads def _calculate_insertions(aligned_read): """ Calculate the insertion positions to add to the extended reference. Parameters ---------- aligned_read : pysam.AlignedRead The aligned read from a sam/bam file Returns ------- insertions : Iterable of insertion positions """ offset = 0 ins = defaultdict(int) cigar = aligned_read.cigar pos = aligned_read.pos for op, count in cigar: if op == INSERTION: # output insertion positions ins[pos + offset - 1] = max(ins[pos + offset - 1], count) elif op < 2: # need to offset for both MATCH and INSERTION offset += count return ins def _retrieve_insertion(new_read): """ Before going to insertion on cigar last insertion should be retrieved. """ # ? What is the point of this, to remove last insertions? tm = new_read.pop() if len(tm) > 0 and tm[0] != INSERT_CHAR: new_read.append(tm) def _adjust_read(aligned_read, inserts): """ Adjust the read to have extended characters represented by deleted positions. Parameters ---------- aligned_read : pysam.AlignedRead The aligned read to add deletion characters to. inserts : dict Dictionary of indicies indicating an insertion in the original reference. Returns ------- extended_read : Bio.Seq Extended sequence """ # offset on reference and read g_offset, l_offset = 0, 0 read = [] for op, count in aligned_read.cigar: pos = aligned_read.pos if op == DELETION or op == MATCH: # append the number of deletion chars # append the bases, but also check if position # is an insertion char in the extended ref for i in range(count): char = DELETE_CHAR if op == DELETION \ else aligned_read.seq[i + l_offset] read.append(char) cur_pos = i + g_offset + pos if cur_pos in inserts and inserts[cur_pos]: read.append(INSERT_CHAR * inserts[cur_pos]) g_offset += count elif op == INSERTION: # append the insertion bases _retrieve_insertion(read) read.append(aligned_read.seq[l_offset:l_offset + count]) ins = inserts[g_offset + pos - 1] if count < ins: read.append(INSERT_CHAR * (ins - count)) if op == MATCH or op == INSERTION or op == CLIPPING: l_offset += count elif op == DELETION: l_offset -= count return "".join(read) def _cached_pos_offset(cache, ref_inserts, pos): if pos - 1 in cache: return cache[pos - 1] val = 0 for i in range(pos): val += ref_inserts[i] cache[i] = val return cache[pos - 1]