Source code for ymp.blast

"""
Parsers for blast output formats 6 (CSV) and 7 (CSV with comments between queries).
"""

import logging

from collections import namedtuple
from typing import List


log = logging.getLogger(__name__)  # pylint: disable=invalid-name


[docs]def reader(fileobj, t: int=7) -> 'BlastParser':
    """
    Creates a reader for files in BLAST format

    >>> with open(blast_file) as infile:
    >>>    reader = blast.reader(infile)
    >>>    for hit in reader:
    >>>       print(hit)

    Args:
      fileobj: iterable yielding lines in blast format
      t: number of blast format type
    """
    if t == 7:
        return Fmt7Parser(fileobj)
    elif t == 6:
        return Fmt6Parser(fileobj)
    else:
        raise ValueError("other formats not implemented")


[docs]class BlastParser(object):
    "Base class for BLAST parsers"

[docs]    def tupleofint(text):
        try:
            return tuple(int(i) for i in text.split(';'))
        except ValueError:
            log.warning("Error parsing BLAST file")
            return tuple()

    # Map between field short and long names
    FIELD_MAP = {
        "query acc.": "qacc",
        "subject acc.": "sacc",
        "% identity": "pident",
        "alignment length": "length",
        "mismatches": "mismatch",
        "gap opens": "gapopen",
        "q. start": "qstart",
        "q. end": "qend",
        "query length": "qlen",
        "s. start": "sstart",
        "s. end": "send",
        "evalue": "evalue",
        "bit score": "bitscore",
        "subject strand": "sstrand",
        "sbjct frame": "sframe",
        "score": "score",
        "query frame": "qframe",
        "subject title": "stitle",
        "subject tax ids": "staxids"
    }

    # Map defining types of fields
    FIELD_TYPE = {
        'pident': float,
        'length': int,
        'mismatch': int,
        'gapopen': int,
        'qstart': int,
        'qend': int,
        'qlen': int,
        'sstart': int,
        'send': int,
        'evalue': float,
        'bitscore': float,
        'score': float,
        'sframe': int,
        'qframe': int,
        'stitle': str,
        'staxids': tupleofint
    }


[docs]class Fmt7Parser(BlastParser):
    """
    Parses BLAST results in format '7' (CSV with comments)
    """
    FIELDS = "# Fields: "
    QUERY = "# Query: "
    DATABASE = "# Database: "
    HITSFOUND = " hits found"

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.fields = None
        if "BLAST" not in fileobj.readline():
            raise ValueError("not a BLAST7 formatted file")

[docs]    def get_fields(self) -> List[str]:
        """Returns list of available field names

        Format 7 specifies which columns it contains in comment lines, allowing
        this parser to be agnostic of the selection of columns made when running
        BLAST.

        Returns:
          List of field names (e.g. ``['sacc', 'qacc', 'evalue']``)
        """
        return self.fields

    def __iter__(self):
        for line in self.fileobj:
            if line.startswith(self.FIELDS):
                self.fields = [
                    self.FIELD_MAP[field]
                    if field in self.FIELD_MAP else field
                    for field in line[len(self.FIELDS):].strip().split(", ")
                ]
                self.Hit = namedtuple("BlastHit", self.fields)
            elif line.startswith(self.QUERY):
                self.query = line[len(self.QUERY):].strip()
            elif line.startswith(self.DATABASE):
                self.query = line[len(self.DATABASE):].strip()
            elif line.strip().endswith(self.HITSFOUND):
                self.hits = int(line.split()[1])
                self.hit = 0
            elif line[0] == "#":
                continue
            else:
                self.hit += 1
                yield self.Hit(*[
                    self.FIELD_TYPE[key](value)
                    if key in self.FIELD_TYPE else value
                    for key, value in zip(self.fields,
                                          line.strip().split('\t'))
                ])

[docs]    def isfirsthit(self) -> bool:
        """Returns `True` if the current hit is the first hit for the current
        query"""
        return self.hit == 1


[docs]class Fmt6Parser(BlastParser):
    """Parser for BLAST format 6 (CSV)
    """
    #: Default field types
    fields = ("qseqid sseqid pident length mismatch gapopen "
              "qstart qend sstart send evalue bitscore").split()
    field_types = [BlastParser.FIELD_TYPE.get(n, None) for n in fields ]
    Hit = namedtuple("BlastHit", fields)
    def __init__(self, fileobj):
        self.fileobj = fileobj

[docs]    def get_fields(self):
        return self.fields

    def __iter__(self):
        for line in self.fileobj:
            yield self.Hit(*[t(v) if t else v
                             for v, t in zip(line.split("\t"),
                                             self.field_types)])