# -*- coding: utf-8 -*-
from Bio.Seq import reverse_complement
from pydna.dseqrecord import Dseqrecord as _Dseqrecord
import re
import itertools as _itertools
from Bio.SeqFeature import SimpleLocation, SeqFeature
from pydna.utils import shift_location
from pydna.sequence_regex import compute_regex_site, dseqrecord_finditer
raw_gateway_common = {
"attB1": "CHWVTWTGTACAAAAAANNNG",
"attB2": "CHWVTWTGTACAAGAAANNNG",
"attB3": "CHWVTWTGTATAATAAANNNG",
"attB4": "CHWVTWTGTATAGAAAANNNG",
"attB5": "CHWVTWTGTATACAAAANNNG",
"attL1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAANNNG",
"attL2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAANNNG",
"attL3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAANNNG",
"attL4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAANNNG",
"attL5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAANNNG",
"attR1": "CHWVTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
"attR2": "CHWVTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
"attR3": "CHWVTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
"attR4": "CHWVTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
"attR5": "CHWVTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
"overlap_1": "twtGTACAAAaaa",
"overlap_2": "twtGTACAAGaaa",
"overlap_3": "twtGTATAATaaa",
"overlap_4": "twtGTATAGAaaa",
"overlap_5": "twtGTATACAaaa",
}
raw_gateway_sites_greedy = {
**raw_gateway_common,
"attP1": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
"attP2": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTACAAGAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
"attP3": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAATAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
"attP4": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATAGAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
"attP5": "VAAWWAWKRWTTTWWTTYGACTGATAGTGACCTGTWCGTYGMAACAVATTGATRAGCAATKMTTTYYTATAWTGHCMASTWTGTATACAAAAGYWGARCGAGAARCGTAARRTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATRCTGTAARACACAACATATBCAGTCV",
}
raw_gateway_sites_conservative = {
**raw_gateway_common,
"attP1": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
"attP2": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTACAAGAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
"attP3": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAATAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
"attP4": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATAGAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
"attP5": "AAAWWAWKRWTTTWWTTTGACTGATAGTGACCTGTTCGTTGCAACAMATTGATRAGCAATGCTTTYTTATAATGCCMASTTTGTATACAAAAGYWGAACGAGAARCGTAAARTGATATAAATATCAATATATTAAATTAGAYTTTGCATAAAAAACAGACTACATAATACTGTAAAACACAACATATSCAGTCACTATGAAYCAACTACTTAGATGGTATTAGTGACCTGTA",
}
gateway_sites_greedy = {
k: {
"forward_regex": compute_regex_site(v),
"reverse_regex": compute_regex_site(reverse_complement(v)),
"consensus_sequence": v,
}
for k, v in raw_gateway_sites_greedy.items()
}
gateway_sites_conservative = {
k: {
"forward_regex": compute_regex_site(v),
"reverse_regex": compute_regex_site(reverse_complement(v)),
"consensus_sequence": v,
}
for k, v in raw_gateway_sites_conservative.items()
}
# From snapgene - ask Valerie
primer_design_attB = {
"attB1": "ACAAGTTTGTACAAAAAAGCAGGCT",
"attB2": "ACCACTTTGTACAAGAAAGCTGGGT",
"attB3": "ACAACTTTGTATAATAAAGTTGTA",
"attB4": "ACAACTTTGTATAGAAAAGTTGTA",
"attB5": "ACAACTTTGTATACAAAAGTTGTA",
}
[docs]
def gateway_overlap(
seqx: _Dseqrecord, seqy: _Dseqrecord, reaction: str, greedy: bool
) -> list[tuple[int, int, int]]:
"""
Find gateway overlaps. If greedy is True, it uses a more greedy consensus site to find attP sites,
which might give false positives
"""
if reaction not in ["BP", "LR"]:
raise ValueError(f"Invalid overlap type: {reaction}")
gateway_sites = gateway_sites_greedy if greedy else gateway_sites_conservative
out = list()
# Iterate over the four possible att sites
for num in range(1, 5):
# Iterate over the two possible orientations
# The sites have to be in the same orientation (fwd + fwd or rev + rev)
for pattern in ["forward_regex", "reverse_regex"]:
# The overlap regex is the same for all types
overlap_regex = gateway_sites[f"overlap_{num}"][pattern]
# Iterate over pairs B, P and P, B for BP and L, R and R, L for LR
for site_x, site_y in zip(reaction, reaction[::-1]):
site_x_regex = gateway_sites[f"att{site_x}{num}"][pattern]
matches_x = list(dseqrecord_finditer(site_x_regex, seqx))
if len(matches_x) == 0:
continue
site_y_regex = gateway_sites[f"att{site_y}{num}"][pattern]
matches_y = list(dseqrecord_finditer(site_y_regex, seqy))
if len(matches_y) == 0:
continue
for match_x, match_y in _itertools.product(matches_x, matches_y):
# Find the overlap sequence within each match, and use the
# core 7 pbs that are constant
overlap_x = re.search(overlap_regex, match_x.group())
overlap_y = re.search(overlap_regex, match_y.group())
# Sanity check
assert (
overlap_x is not None and overlap_y is not None
), "Something went wrong, no overlap found within the matches"
out.append(
(
match_x.start() + overlap_x.start() + 3,
match_y.start() + overlap_y.start() + 3,
7,
)
)
return out
[docs]
def find_gateway_sites(
seq: _Dseqrecord, greedy: bool
) -> dict[str, list[SimpleLocation]]:
"""Find all gateway sites in a sequence and return a dictionary with the name and positions of the sites."""
gateway_sites = gateway_sites_greedy if greedy else gateway_sites_conservative
out = dict()
for site in gateway_sites:
if not site.startswith("att"):
continue
for pattern in ["forward_regex", "reverse_regex"]:
matches = list(dseqrecord_finditer(gateway_sites[site][pattern], seq))
for match in matches:
if site not in out:
out[site] = []
strand = 1 if pattern == "forward_regex" else -1
loc = SimpleLocation(match.start(), match.end(), strand)
loc = shift_location(loc, 0, len(seq))
out[site].append(loc)
return out
[docs]
def annotate_gateway_sites(seq: _Dseqrecord, greedy: bool) -> _Dseqrecord:
sites = find_gateway_sites(seq, greedy)
for site in sites:
for loc in sites[site]:
seq.features.append(
SeqFeature(loc, type="protein_bind", qualifiers={"label": [site]})
)
return seq