#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2013-2023 by Björn Johansson. All rights reserved.
# This code is part of the Python-dna distribution and governed by its
# license. Please see the LICENSE.txt file that should have been included
# as part of this package.
"""Provides two functions, parse and parse_primers"""
import re
import io
import textwrap
import os
from Bio import SeqIO
from Bio.SeqIO.InsdcIO import GenBankScanner, GenBankIterator
import warnings
from pydna.dseqrecord import Dseqrecord
from Bio.SeqRecord import SeqRecord
from pydna.opencloning_models import UploadedFileSource
from pydna.primer import Primer
try:
from itertools import pairwise
except ImportError:
def pairwise(iterable):
# pairwise('ABCDEFG') → AB BC CD DE EF FG
iterator = iter(iterable)
a = next(iterator, None)
for b in iterator:
yield a, b
a = b
# "^>.+?^(?=$|LOCUS|ID|>|\#)|^(?:LOCUS|ID).+?^//"
# "(?:^>.+\n^(?:^[^>]+?)(?=\n\n|>|^LOCUS|ID))|(?:(?:^LOCUS|ID)(?:(?:.|\n)+?)^//)"
# gb_fasta_embl_regex = r"(?:>.+\n^(?:^[^>]+?)(?=\n\n|>|LOCUS|ID))|(?:(?:LOCUS|ID)(?:(?:.|\n)+?)^//)"
gb_fasta_embl_regex = (
r"(?:^>.+\n^(?:^[^>]+?)(?=\n\n|>|^LOCUS|^ID))|(?:(?:^LOCUS|^ID)(?:(?:.|\n)+?)^//)"
)
# The gb_fasta_embl_regex is meant to be able to extract sequences from
# text where sequences are mixed with other contents as well
# use https://regex101.com to get an idea how it works.
[docs]
class CustomGenBankScanner(GenBankScanner):
"""
A custom GenBank scanner that parses the LOCUS line and extracts the name, size, molecule_type, topology and date
from malformed GenBank files.
For example, the following line:
```
LOCUS pKM265 4536 bp DNA circular SYN 21-JUN-2013
```
"""
def _feed_first_line(self, consumer, line):
# A regex for
m = re.match(
r"(?i)LOCUS\s+(?P<name>\S+)\s+(?P<size>\d+ bp)\s+(?P<molecule_type>\S+)(?:\s+(?P<topology>circular|linear))?(?:\s+.+\s+)?(?P<date>\d+-\w+-\d+)?",
line,
)
if m is None: # pragma: no cover - I don't think this can happen, but JIC
raise ValueError("LOCUS line cannot be parsed")
name, size, molecule_type, topology, date = m.groups()
consumer.locus(name)
consumer.size(size[:-3])
consumer.molecule_type(molecule_type)
consumer.topology(topology.lower() if topology is not None else None)
consumer.date(date)
[docs]
class CustomGenBankIterator(GenBankIterator):
def __init__(self, source):
super(GenBankIterator, self).__init__(source, fmt="GenBank")
self.records = CustomGenBankScanner(debug=0).parse_records(self.stream)
[docs]
def parse_genbank(handle: io.StringIO) -> SeqRecord:
"""
Equivalent to SeqIO.read(handle, "genbank") but with more permissive parsing for the LOCUS line. It also
removes BASE_COUNT lines that can be misplaced and anyway are not stored in the SeqRecord.annotations.
"""
filtered_lines = list()
for line in handle:
if not line.lstrip().startswith("BASE COUNT"):
filtered_lines.append(line)
filtered_handle = io.StringIO("".join(filtered_lines))
try:
# This may raise a ValueError such as "LOCUS line does not contain space at position X"
# then use the more permissive CustomGenBankIterator
parsed = SeqIO.read(filtered_handle, "genbank")
# Sometimes the line is enough to parse the record, but not enough to parse the topology,
# then raise an error starting with "LOCUS line does not contain" to trigger the
# more permissive CustomGenBankIterator
if "topology" not in parsed.annotations.keys():
raise ValueError("LOCUS line does not contain topology")
return parsed
except ValueError as e:
if "LOCUS line does not contain" not in str(e):
raise e
filtered_handle.seek(0)
warnings.warn(
"LOCUS line is wrongly formatted, we used a more permissive parser.",
stacklevel=2,
)
return next(CustomGenBankIterator(filtered_handle))
[docs]
def embl_gb_fasta(text):
"""Parse embl, genbank or fasta format from text.
Returns list of Bio.SeqRecord.SeqRecord
annotations["molecule_type"]
annotations["topology"]
"""
chunks, gaps = extract_from_text(text)
result_list = []
for chunk in chunks:
handle = io.StringIO(chunk)
first_line = chunk.splitlines()[0].lower().split()
try:
parsed = SeqIO.read(handle, "embl")
parsed.annotations["pydna_parse_sequence_file_format"] = "embl"
except ValueError:
handle.seek(0)
try:
parsed = parse_genbank(handle)
parsed.annotations["pydna_parse_sequence_file_format"] = "genbank"
except ValueError:
handle.seek(0)
try:
parsed = SeqIO.read(handle, "fasta-blast")
parsed.annotations["pydna_parse_sequence_file_format"] = "fasta"
except ValueError:
handle.close()
continue
else:
# molecule_type is not set by the Biopython FASTA parser
parsed.annotations["molecule_type"] = "DNA"
handle.close()
# hack to pick up topology from FASTA and malformed gb files
first_line = chunk.splitlines()[0].lower().split()
parsed.annotations["topology"] = "linear"
if "circular" in first_line:
parsed.annotations["topology"] = "circular"
molecule_type = parsed.annotations.get("molecule_type")
assert molecule_type, "molecule_type must be set"
assert molecule_type != "protein", "molecule_type can not be 'protein'"
result_list.append(parsed)
return tuple(result_list)
[docs]
def parse(data, ds=True, is_path=None) -> list[Dseqrecord | SeqRecord]:
"""Return *all* DNA sequences found in data.
If no sequences are found, an empty list is returned. This is a greedy
function, use carefully.
Parameters
----------
data : string or iterable
The data parameter is a string containing:
1. an absolute path to a local file.
The file will be read in text
mode and parsed for EMBL, FASTA
and Genbank sequences. Can be
a string or a Path object.
2. a string containing one or more
sequences in EMBL, GENBANK,
or FASTA format. Mixed formats
are allowed.
3. data can be a list or other iterable where the elements are 1 or 2
ds : bool
If True double stranded :class:`Dseqrecord` objects are returned.
If False single stranded :class:`Bio.SeqRecord` [#]_ objects are
returned.
is_path : bool, optional
If True, the data is treated as a path to a file. If False, the data is treated as a string (e.g. FASTA file content).
If None, both are tried.
Returns
-------
list
contains Dseqrecord or SeqRecord objects
References
----------
.. [#] http://biopython.org/wiki/SeqRecord
See Also
--------
read
"""
# a string is an iterable datatype but on Python2.x
# it doesn't have an __iter__ method.
if not hasattr(data, "__iter__") or isinstance(data, (str, bytes)):
data = (data,)
sequences = []
for item in data:
if is_path is None:
path = item if os.path.isfile(item) else None
elif is_path:
path = item
else:
path = None
if path:
with open(path, "r", encoding="utf-8") as f:
raw = f.read()
else:
raw = item
newsequences = embl_gb_fasta(raw)
for s in newsequences:
if ds and path:
from pydna.opencloning_models import UploadedFileSource
result = Dseqrecord.from_SeqRecord(s)
result.source = UploadedFileSource(
file_name=str(path), # we use str to handle PosixPath
sequence_file_format=s.annotations[
"pydna_parse_sequence_file_format"
],
index_in_file=0,
)
sequences.append(result)
elif ds:
sequences.append(Dseqrecord.from_SeqRecord(s))
else:
sequences.append(s)
return sequences
[docs]
def parse_primers(data):
"""docstring."""
return [Primer(x) for x in parse(data, ds=False)]
[docs]
def parse_snapgene(file_path: str) -> list[Dseqrecord]:
"""Parse a SnapGene file and return a Dseqrecord object.
Parameters
----------
file_path : str
The path to the SnapGene file to parse.
Returns
-------
Dseqrecord
The parsed SnapGene file as a Dseqrecord object.
"""
with open(file_path, "rb") as f:
parsed_seq = next(SeqIO.parse(f, "snapgene"))
circular = (
"topology" in parsed_seq.annotations.keys()
and parsed_seq.annotations["topology"] == "circular"
)
source = UploadedFileSource(
file_name=str(file_path),
sequence_file_format="snapgene",
index_in_file=0,
)
return [Dseqrecord(parsed_seq, circular=circular, source=source)]