Source code for dataset_creator.dataset

#! encoding: utf-8
from collections import namedtuple
try:
    from collections import OrderedDict
except ImportError:
    from ordereddict import OrderedDict

from .creator import Creator
from .utils import get_seq


[docs]class Dataset(object): """User's class for making datasets of several formats. It needs as input a list of SeqRecord-expanded objects with as much info as possible: Parameters: seq_records (list): SeqRecordExpanded objects. The list should be sorted by gene_code and then voucher code. format (str): NEXUS, PHYLIP, TNT, MEGA, GenBankFASTA. partitioning (str): Partitioning scheme: ``by gene`` (default), ``by codon position`` (each) and ``1st-2nd, 3rd``. codon_positions (str): Can be ``1st``, ``2nd``, ``3rd``, ``1st-2nd``, ``ALL`` (default). aminoacids (boolean): Returns the dataset as aminoacid sequences. degenerate (str): Method to degenerate nucleotide sequences, following Zwick et al. Can be ``S``, ``Z``, ``SZ`` and ``normal``. outgroup (str): voucher code to be used as outgroup for NEXUS and TNT files. Attributes: _gene_codes_and_lengths (dict): in the form ``gene_code: list`` The list contains sequence lengths for its sequences. We assume the longest to be the real gene_code sequence length. Example: >>> dataset = Dataset(seq_records, format='NEXUS', codon_positions='1st', ... partitioning='by gene', ... ) >>> print(dataset.dataset_str) '#NEXUS blah blah ' >>> dataset = Dataset(seq_records, format='PHYLIP', codon_positions='ALL', ... partitioning='by gene', ... ) >>> print(dataset.dataset_str) '100 10 blah blah ' """ def __init__(self, seq_records, format=None, partitioning=None, codon_positions=None, aminoacids=None, degenerate=None, outgroup=None): self.warnings = [] self.seq_records = self.sort_seq_records(seq_records) self.gene_codes = None self.number_taxa = None self.number_chars = None self.reading_frames = {} self.format = format self.partitioning = partitioning self.codon_positions = codon_positions self.aminoacids = aminoacids self.degenerate = degenerate self.outgroup = None self._validate_codon_positions(codon_positions) self._validate_partitioning(partitioning) self._validate_outgroup(outgroup) self.data = None self._gene_codes_and_lengths = OrderedDict() self._prepare_data() self.extra_dataset_str = None self.dataset_str = self._create_dataset()
[docs] def sort_seq_records(self, seq_records): """Checks that SeqExpandedRecords are sorted by gene_code and then by voucher code. The dashes in taxon names need to be converted to underscores so the dataset will be accepted by Biopython to do format conversions. """ for seq_record in seq_records: seq_record.voucher_code = seq_record.voucher_code.replace("-", "_") unsorted_gene_codes = set([i.gene_code for i in seq_records]) sorted_gene_codes = list(unsorted_gene_codes) sorted_gene_codes.sort(key=lambda x: x.lower()) unsorted_voucher_codes = set([i.voucher_code for i in seq_records]) sorted_voucher_codes = list(unsorted_voucher_codes) sorted_voucher_codes.sort(key=lambda x: x.lower()) sorted_seq_records = [] for gene_code in sorted_gene_codes: for voucher_code in sorted_voucher_codes: for seq_record in seq_records: should_be_done = ( seq_record.gene_code == gene_code and seq_record.voucher_code == voucher_code ) if should_be_done: sorted_seq_records.append(seq_record) return sorted_seq_records
def _validate_partitioning(self, partitioning): if partitioning is None: self.partitioning = 'by gene' elif partitioning not in ['by gene', 'by codon position', '1st-2nd, 3rd']: raise AttributeError("Partitioning parameter should be one of these: " "None, 'by gene', 'by codon position', '1st-2nd, 3rd") elif partitioning in ['by codon position', '1st-2nd, 3rd'] \ and self.degenerate: raise ValueError("Cannot degenerate if partitions scheme is {0!r}".format( partitioning)) elif partitioning in ['by codon position', '1st-2nd, 3rd'] and self.format == 'MEGA': raise ValueError("Cannot produce MEGA dataset with codon positions in different partitions") def _validate_codon_positions(self, codon_positions): if codon_positions is None: self.codon_positions = 'ALL' elif codon_positions not in ['1st', '2nd', '3rd', '1st-2nd', 'ALL']: raise AttributeError("Codon positions parameter should be one of these: " "None, '1st', '2nd', '3rd', '1st-2nd', 'ALL'") def _validate_outgroup(self, outgroup): """All voucher codes in our datasets have dashes converted to underscores.""" if outgroup: outgroup = outgroup.replace("-", "_") good_outgroup = False for seq_record in self.seq_records: if seq_record.voucher_code == outgroup: good_outgroup = True break if good_outgroup: self.outgroup = outgroup else: raise ValueError("The given outgroup {0!r} cannot be found in the " "input sequence records.".format(outgroup)) else: self.outgroup = None def _prepare_data(self): """ Creates named tuple with info needed to create a dataset. :return: named tuple """ self._extract_genes() self._extract_total_number_of_chars() self._extract_number_of_taxa() self._extract_reading_frames() Data = namedtuple('Data', ['gene_codes', 'number_taxa', 'number_chars', 'seq_records', 'gene_codes_and_lengths', 'reading_frames']) self.data = Data(self.gene_codes, self.number_taxa, self.number_chars, self.seq_records, self._gene_codes_and_lengths, self.reading_frames) def _extract_genes(self): gene_codes = [i.gene_code for i in self.seq_records] unique_gene_codes = list(set(gene_codes)) # this is better: unique_gene_codes.sort(key=str.lower) # but will not work in python2 unique_gene_codes.sort(key=lambda x: x.lower()) self.gene_codes = unique_gene_codes def _extract_total_number_of_chars(self): """ sets `self.number_chars` to the number of characters as string. """ self._get_gene_codes_and_seq_lengths() sum = 0 for seq_length in self._gene_codes_and_lengths.values(): sum += sorted(seq_length, reverse=True)[0] self.number_chars = str(sum) def _get_gene_codes_and_seq_lengths(self): for seq_record in self.seq_records: if seq_record.gene_code not in self._gene_codes_and_lengths: self._gene_codes_and_lengths[seq_record.gene_code] = [] if self.aminoacids: seq = seq_record.translate() elif not self.aminoacids and self.degenerate is not None: seq = seq_record.degenerate(method=self.degenerate) else: sequence = get_seq(seq_record, self.codon_positions) seq = sequence.seq if sequence.warning: self.warnings.append(sequence.warning) self._gene_codes_and_lengths[seq_record.gene_code].append(len(seq)) def _extract_number_of_taxa(self): """ sets `self.number_taxa` to the number of taxa as string """ n_taxa = dict() for i in self.seq_records: if i.gene_code not in n_taxa: n_taxa[i.gene_code] = 0 n_taxa[i.gene_code] += 1 number_taxa = sorted([i for i in n_taxa.values()], reverse=True)[0] self.number_taxa = str(number_taxa) def _extract_reading_frames(self): for seq_record in self.seq_records: if seq_record.gene_code not in self.reading_frames: self.reading_frames[seq_record.gene_code] = seq_record.reading_frame def _create_dataset(self): creator = Creator(self.data, format=self.format, codon_positions=self.codon_positions, partitioning=self.partitioning, aminoacids=self.aminoacids, degenerate=self.degenerate, outgroup=self.outgroup, ) self.warnings = creator.warnings self.extra_dataset_str = creator.extra_dataset_str dataset_str = creator.dataset_str return dataset_str