# Source code for dataset_creator.utils

# -*- coding: UTF-8 -*-
import six
if six.PY2:
    from StringIO import StringIO
else:
    from io import StringIO

import os
from collections import namedtuple
import uuid

from Bio import AlignIO

from .exceptions import WrongParameterFormat


# Result type of ``get_seq``. Built once at import time rather than on every
# call, since creating a namedtuple class is comparatively expensive.
Sequence = namedtuple('Sequence', ['seq', 'warning'])

# Accepted values for the ``codon_positions`` argument of ``get_seq``.
_VALID_CODON_POSITIONS = (None, '1st', '2nd', '3rd', '1st-2nd', 'ALL')

# Maps a ``codon_positions`` value to the name of the ``seq_record`` method
# that produces the matching sequence. ``None`` and ``'ALL'`` are absent on
# purpose: they fall back to the full sequence.
_CODON_POSITION_METHODS = {
    '1st': 'first_codon_position',
    '2nd': 'second_codon_position',
    '3rd': 'third_codon_position',
    '1st-2nd': 'first_and_second_codon_positions',
}


def get_seq(seq_record, codon_positions, aminoacids=False, degenerate=None):
    """
    Return the sequence of ``seq_record`` in the form selected by the arguments.

    The arguments are honoured in this order of precedence: ``aminoacids``,
    then ``degenerate``, then ``codon_positions``. ``codon_positions`` is
    validated first, even when one of the other two ends up being used.

    Parameters:
        seq_record (SeqRecordExpanded object): record providing ``seq``,
            ``translate()``, ``degenerate()``, the codon position methods and,
            for the warning message, ``gene_code`` and ``voucher_code``.
        codon_positions (str): one of ``None``, ``1st``, ``2nd``, ``3rd``,
            ``1st-2nd`` or ``ALL``. ``None`` and ``ALL`` return the whole
            sequence.
        aminoacids (boolean): if true, return ``seq_record.translate()``.
        degenerate (str): if true-ish, it is passed to
            ``seq_record.degenerate()`` and that result is returned.

    Returns:
        Namedtuple containing ``seq (str)`` and ``warning (str)``. ``warning``
        is ``None`` unless the translation contains a ``*``.

    Raises:
        WrongParameterFormat: if ``codon_positions`` is not an accepted value.
    """
    if codon_positions not in _VALID_CODON_POSITIONS:
        raise WrongParameterFormat("`codon_positions` argument should be any of the following"
                                   ": 1st, 2nd, 3rd, 1st-2nd or ALL")

    if aminoacids:
        aa = seq_record.translate()
        warning = None
        if '*' in aa:
            warning = "Gene {0}, sequence {1} contains stop codons '*'".format(
                seq_record.gene_code, seq_record.voucher_code)
        return Sequence(seq=aa, warning=warning)

    if degenerate:
        return Sequence(seq=seq_record.degenerate(degenerate), warning=None)

    method_name = _CODON_POSITION_METHODS.get(codon_positions)
    if method_name is None:  # None and ALL
        return Sequence(seq=str(seq_record.seq), warning=None)
    return Sequence(seq=getattr(seq_record, method_name)(), warning=None)


def convert_nexus_to_format(dataset_as_nexus, dataset_format):
    """
    Converts a dataset in NEXUS format to another format using Biopython tools.

    The conversion is done entirely in memory: the NEXUS text is parsed from
    one ``StringIO`` handle and the result is written to another, so no
    temporary file is created and nothing is left behind if Biopython raises.

    :param dataset_as_nexus: the dataset as a NEXUS-formatted string.
    :param dataset_format: output format name, passed unchanged to
        ``AlignIO.write``.
    :return: the converted dataset as a string.
    """
    nexus_handle = StringIO(dataset_as_nexus)
    converted_handle = StringIO()
    alignments = AlignIO.parse(nexus_handle, 'nexus')
    AlignIO.write(alignments, converted_handle, dataset_format)
    return converted_handle.getvalue()


def make_random_filename():
    """
    Returns a random file name.

    The name is the hexadecimal form of a fresh ``uuid.uuid4()`` followed by
    the ``.txt`` extension. It carries no directory component.
    """
    return '{0}.txt'.format(uuid.uuid4().hex)


def read_and_delete_tmp_file(filename):
    """
    Reads a temporary file as text, deletes it and returns its contents.

    The file is removed even when opening or reading it raises, so a failed
    read does not leave the temporary file behind. The original exception
    still propagates in that case.

    :param filename: path of the temporary file.
    :return: contents of the file as a string.
    """
    try:
        with open(filename, "r") as handle:
            contents = handle.read()
    finally:
        # Runs after the ``with`` block has closed the handle, so the file is
        # no longer open when it is removed.
        if os.path.isfile(filename):
            os.remove(filename)

    return contents


def make_dataset_header(data, file_format, aminoacids):
    """Creates the header that precedes the sequence matrix of a dataset.

    * ``NEXUS``, ``PHYLIP`` and ``FASTA`` all get the NEXUS header, from
      ``#NEXUS`` to ``MATRIX``.
    * ``MEGA`` gets the fixed ``#MEGA`` title lines.
    * Any other value is treated as ``TNT`` and gets the ``nstates`` /
      ``xread`` header.

    Parameters:
        data (namedtuple):    with necessary info for dataset creation. Only
                              ``number_taxa`` and ``number_chars`` are read.
        file_format (str):    TNT, PHYLIP, NEXUS, FASTA, MEGA
        aminoacids (boolean): If true the NEXUS header will show
                              ``DATATYPE=PROTEIN`` and the TNT header
                              ``nstates prot``; otherwise ``DNA`` and
                              ``nstates dna``.

    Returns:
        The header as a string without leading or trailing whitespace.
    """
    if file_format == 'MEGA':
        return "#MEGA\n!TITLE title;"

    if file_format in ('NEXUS', 'PHYLIP', 'FASTA'):
        template = "\n".join((
            "#NEXUS",
            "",
            "BEGIN DATA;",
            "DIMENSIONS NTAX={0} NCHAR={1};",
            "FORMAT INTERLEAVE DATATYPE={2} MISSING=? GAP=-;",
            "MATRIX",
        ))
        datatype = 'PROTEIN' if aminoacids else 'DNA'
        header = template.format(data.number_taxa, data.number_chars, datatype)
    else:  # file_format: TNT
        template = "\n".join((
            "nstates {0};",
            "xread",
            "{1} {2}",
        ))
        molecule_type = "prot" if aminoacids else "dna"
        # TNT lists the number of characters before the number of taxa.
        header = template.format(molecule_type, data.number_chars, data.number_taxa)

    return header.strip()