Source code for dataset_creator.creator

from . import base_dataset
from . import tnt
from . import mega
from . import genbank_fasta
from .phylip import PhylipDatasetFooter
from .utils import convert_nexus_to_format
from .utils import make_dataset_header


class Creator(object):
    """Build a dataset string (plus auxiliary text) in a requested format.

    The pieces are produced in a fixed order at construction time: header,
    data block, footer, the assembled dataset and finally the extra
    (charset) text.  For ``PHYLIP`` and most ``FASTA`` requests the
    header and data block are joined and passed to
    ``convert_nexus_to_format`` to obtain the final text.

    Parameters:
        data (named tuple): containing:
            * gene_codes: list
            * number_chars: string
            * number_taxa: string
            * seq_records: list of SeqRecordExpanded objects
            * gene_codes_and_lengths
        format (str): ``NEXUS``, ``PHYLIP``, ``FASTA``, ``GenBankFASTA``,
            ``MEGA`` or ``TNT``.  Any other value (including the default
            ``None``) is handled by the TNT block builder and assembled as
            header + data block.
        codon_positions (str): Can be 1st, 2nd, 3rd, 1st-2nd, ALL.
        partitioning (str): 'by gene', 'by codon position', '1st-2nd, 3rd'
        aminoacids (boolean): To create aminoacid sequences instead of
            returning nucleotides.
        degenerate (str): Method to degenerate nucleotide sequences,
            following Zwick et al. Can be ``S``, ``Z``, ``SZ`` and ``normal``.
        outgroup (str): voucher code to be used as outgroup. Only forwarded
            to the TNT block builder by this class.

    Attributes:
        warnings (list): starts empty and is replaced by the ``warnings``
            attribute of whichever block builder was used.
        dataset_header: value returned by ``create_dataset_header``.
        dataset_block: value returned by ``create_dataset_block``.
        dataset_footer: value returned by ``create_dataset_footer``.
        dataset_str: the assembled dataset, see ``put_everything_together``.
        extra_dataset_str (str): Charset block in Phylip formatted datasets.

    Example:
        >>> dataset_creator = Creator(data, format='NEXUS', codon_positions='ALL',
        ...                           partitioning='by gene')
        >>> dataset_creator.dataset_str
        '#NEXUS blah blah '
    """

    def __init__(self, data, format=None, codon_positions=None,
                 partitioning=None, aminoacids=None, degenerate=None,
                 outgroup=None):
        # Plain configuration first; every builder below reads these.
        self.warnings = []
        self.data = data
        self.format = format
        self.codon_positions = codon_positions
        self.partitioning = partitioning
        self.aminoacids = aminoacids
        self.degenerate = degenerate
        self.outgroup = outgroup

        # Order matters: the assembled string needs header, block and
        # footer to exist already.
        self.dataset_header = self.create_dataset_header()
        self.dataset_block = self.create_dataset_block()
        # NOTE(review): create_dataset_footer is not defined in the part of
        # the class visible here -- confirm it exists elsewhere.
        self.dataset_footer = self.create_dataset_footer()
        self.dataset_str = self.put_everything_together()
        self.extra_dataset_str = self.create_extra_dataset_file()

    def create_dataset_header(self):
        """Return the header produced by ``make_dataset_header``."""
        return make_dataset_header(
            self.data,
            file_format=self.format,
            aminoacids=self.aminoacids,
        )

    def create_dataset_block(self):
        """Build the data block with the builder matching ``self.format``.

        Side effect: ``self.warnings`` is replaced by the ``warnings``
        attribute of the builder that was used.

        Returns:
            Whatever the selected builder's ``dataset_block()`` returns.
        """
        fmt = self.format
        data = self.data
        positions = self.codon_positions
        scheme = self.partitioning

        if fmt in ('NEXUS', 'PHYLIP', 'FASTA'):
            builder = base_dataset.DatasetBlock(
                data, positions, scheme,
                self.aminoacids, self.degenerate, fmt,
            )
        elif fmt == 'GenBankFASTA':
            builder = genbank_fasta.GenBankFASTADatasetBlock(
                data, positions, scheme,
                aminoacids=self.aminoacids,
                degenerate=self.degenerate,
            )
        elif fmt == 'MEGA':
            builder = mega.MegaDatasetBlock(
                data, positions, scheme,
                aminoacids=self.aminoacids,
                degenerate=self.degenerate,
            )
        else:
            # TNT, and also the fallback for any unrecognised format.
            builder = tnt.TntDatasetBlock(
                data, positions, scheme,
                degenerate=self.degenerate,
                aminoacids=self.aminoacids,
                outgroup=self.outgroup,
            )

        block = builder.dataset_block()
        self.warnings = builder.warnings
        return block

    def create_extra_dataset_file(self):
        """Return the charset block made by ``PhylipDatasetFooter``."""
        footer = PhylipDatasetFooter(
            self.data,
            codon_positions=self.codon_positions,
            partitioning=self.partitioning,
        )
        return footer.make_charset_block()

    def put_everything_together(self):
        """Assemble the final dataset text for ``self.format``.

        * ``NEXUS``: header, data block and footer separated by blank lines.
        * ``PHYLIP``: header + data block converted to ``phylip-relaxed``;
          also refreshes ``self.extra_dataset_str``.
        * ``FASTA``: header + data block converted to ``fasta``, unless the
          partitioning is ``'1st-2nd, 3rd'``, in which case the data block
          is returned with its ``';\\nEND;'`` terminator removed.
        * ``GenBankFASTA``: data block with ``';\\nEND;'`` removed.
        * anything else (TNT, MEGA, ...): header + data block.
        """
        fmt = self.format
        merged = '{header}\n\n{block}'.format(
            header=self.dataset_header,
            block=self.dataset_block,
        )

        if fmt == 'NEXUS':
            return '{body}\n\n{footer}'.format(
                body=merged,
                footer=self.dataset_footer,
            )

        if fmt == 'PHYLIP':
            self.extra_dataset_str = self.create_extra_dataset_file()
            return convert_nexus_to_format(merged, 'phylip-relaxed')

        if fmt == 'FASTA':
            if self.partitioning != '1st-2nd, 3rd':
                return convert_nexus_to_format(merged, 'fasta')
            return self.dataset_block.replace(';\nEND;', '')

        if fmt == 'GenBankFASTA':
            return self.dataset_block.replace(';\nEND;', '')

        # TNT, MEGA and every other value share the same layout.
        return merged