Coverage for constants.py : 96%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
"""
Defines the constants that are used in the rest of the project.

Such as the masses of amino acids, supported modifications, length of the encodings,
maximum length supported, labels and order of the encoded ions ...

Greatly inspired/copied from:
https://github.com/kusterlab/prosit/blob/master/prosit/constants.py

And released under an Apache 2.0 license
"""

VAL_SPLIT = 0.8

TOLERANCE_FTMS = 25
TOLERANCE_ITMS = 0.35
TOLERANCE_TRIPLETOF = 0.5

# (value, unit) per analyzer.
# NOTE(review): TOLERANCE_TRIPLETOF above is 0.5 while the entry here is
# (50, "ppm") -- the two are kept as found; confirm which one is intended.
TOLERANCE = {"FTMS": (25, "ppm"), "ITMS": (0.35, "da"), "TripleTOF": (50, "ppm")}

CHARGES = [1, 2, 3, 4, 5, 6]
DEFAULT_MAX_CHARGE = max(CHARGES)
MAX_FRAG_CHARGE = 3
MAX_SEQUENCE = 30
# Two positions more than MAX_SEQUENCE; presumably room for the "n"/"c"
# terminal placeholders defined in AMINO_ACID -- TODO confirm.
MAX_TENSOR_SEQUENCE = MAX_SEQUENCE + 2
MAX_ION = MAX_SEQUENCE - 1
ION_TYPES = sorted(["y", "b"])  # ["b", "y"]

NLOSSES = ["", "H2O", "NH3"]

FORWARD = {"a", "b", "c"}
BACKWARD = {"x", "y", "z"}

# Atomic elements
PROTON = 1.007276467
ELECTRON = 0.00054858
H = 1.007825035
C = 12.0
O = 15.99491463
N = 14.003074

# Tiny molecules
N_TERMINUS = H
C_TERMINUS = O + H
CO = C + O
CHO = C + H + O
NH2 = N + H * 2
H2O = H * 2 + O
NH3 = N + H * 3

NEUTRAL_LOSS = {"NH3": NH3, "H2O": H2O}

# Mass offset associated with each ion series letter
ION_OFFSET = {
    "a": 0 - CHO,
    "b": 0 - H,
    "c": 0 + NH2,
    "x": 0 + CO - H,
    "y": 0 + H,
    "z": 0 - NH2,
}

# Amino acids
# Modifications use high caps PSI-MS name
MODIFICATION = {
    "CARBAMIDOMETHYL": 57.0214637236,  # Carbamidomethylation (CAM)
    "ACETYL": 42.010565,  # Acetylation
    "DEAMIDATED": 0.984016,  # Deamidation
    "OXIDATION": 15.99491,  # Oxidation
    "PHOSPHO": 79.966331,  # Phosphorylation
    "METHYL": 14.015650,  # Methylation
    "DIMETHYL": 28.031300,  # Dimethylation
    "TRIMETHYL": 42.046950,  # Trimethylation
    "FORMYL": 27.994915,  # Formylation
    "GG": 114.042927,  # GlyGly ubiquitinylation residue
    "LRGG": 383.228103,  # LeuArgGlyGly ubiquitinylation residue
    "NITRO": 44.985078,  # Oxidation to nitro
    "BIOTINYL": 226.077598,  # Biotinylation
}

# Residues (one character each, "n" being the n-terminal placeholder) on which
# each modification is considered.  The insertion order of this dict decides
# which modification wins when two generated aliases collide (see below).
VARIABLE_MODS = {
    "ACETYL": "Kn",  # Acetylation
    "BIOTINYL": "K",  # Biotinylation
    "DEAMIDATED": "RNQ",  # Deamidation
    "OXIDATION": "MP",  # Oxidation
    "PHOSPHO": "STY",  # Phosphorylation
    "METHYL": "KR",  # Methylation
    "DIMETHYL": "KR",  # Dimethylation
    "TRIMETHYL": "K",  # Trimethylation
    "FORMYL": "K",  # Formylation
    "GG": "K",  # GlyGly ubiquitinylation residue
    "NITRO": "Y",  # Oxidation to nitro
}

MOD_INDICES = {name: index for index, name in enumerate(MODIFICATION, start=1)}
# {'CARBAMIDOMETHYL': 1, 'ACETYL': 2, 'DEAMIDATED': 3, ...}
# Indices start at 1; no entry with index 0 is created here.

MOD_INDICES_S = {integer: char for char, integer in MOD_INDICES.items()}
# {1: 'CARBAMIDOMETHYL', 2: 'ACETYL', ...}

AMINO_ACID = {
    "G": 57.021464,
    "R": 156.101111,
    "V": 99.068414,
    "P": 97.052764,
    "S": 87.032028,
    "U": 150.95363,
    "L": 113.084064,
    "M": 131.040485,
    "Q": 128.058578,
    "N": 114.042927,
    "Y": 163.063329,
    "E": 129.042593,
    # The mass of "C" always includes the carbamidomethyl modification
    "C": 103.009185 + MODIFICATION["CARBAMIDOMETHYL"],
    "F": 147.068414,
    "I": 113.084064,
    "A": 71.037114,
    "T": 101.047679,
    "W": 186.079313,
    "H": 137.058912,
    "D": 115.026943,
    "K": 128.094963,
    "n": N_TERMINUS,  # Placeholder to have n terminal modifications
    "c": C_TERMINUS,  # Placeholder to have c terminal modifications
}

AMINO_ACID_SET = set(AMINO_ACID)
ALPHABET = {aa: index for index, aa in enumerate(sorted(AMINO_ACID), start=1)}
# {'A': 1, 'C': 2, ..., 'W': 20, 'Y': 21, 'c': 22, 'n': 23}
# (upper case letters sort before the lower case terminal placeholders)

ALPHABET_S = {integer: char for char, integer in ALPHABET.items()}
# {1: 'A', 2: 'C', ..., 20: 'W', 21: 'Y', 22: 'c', 23: 'n'}

AAS_NUM = len(ALPHABET)

MOD_PEPTIDE_ALIASES = {
    "C[160]": "",  # This makes it so it assumes it is always modified
    "C[+57]": "",  # This makes it so it assumes it is always modified
    "M(ox)": "OXIDATION",
    "M[OXIDATION]": "OXIDATION",
    "P[OXIDATION]": "OXIDATION",  # Hydroxylation of proline
    "S[PHOSPHO]": "PHOSPHO",
    "Y[PHOSPHO]": "PHOSPHO",
    "S[PHOS]": "PHOSPHO",
    "T[PHOSPHO]": "PHOSPHO",
    "T[PHOS]": "PHOSPHO",
    "K[Acetyl]": "ACETYL",
    "K[GlyGly]": "GG",
    # NOTE(review): the integer-mass aliases generated further down also
    # produce "K[156]" and overwrite this value with "DIMETHYL".
    "K[156]": "FORMYL",  # or "DIMETHYL",
    "P[113]": "OXIDATION",  # aka hydroxylation
    "R[157]": "DEAMIDATED",  # aka citrullinated
    "n[43]": "ACETYL",  # n-terminal acetylation
    "n[ACETYL]": "ACETYL",  # n-terminal acetylation
}

# Adds the canonical names to the aliases, like K[GG]
MOD_PEPTIDE_ALIASES.update(
    {
        f"{aa}[{mod_name}]": mod_name
        for mod_name, residues in VARIABLE_MODS.items()
        for aa in residues
    }
)

# This generates aliases like T[+80], M[+15.99], M[+15.9949]
#
# Iteration goes from the most to the least precise rounding and, within each
# rounding, through VARIABLE_MODS from its last entry to its first.  When two
# modifications round to the same alias the later write wins, so the
# modification listed EARLIER in VARIABLE_MODS takes precedence
# (e.g. "K[+42]" resolves to ACETYL rather than TRIMETHYL).
MASS_DIFF_ALIASES = {
    f"{aa}[+{round(MODIFICATION[mod_name], digits):.{digits}f}]": mod_name
    for digits in (4, 2, 0)
    for mod_name, residues in reversed(list(VARIABLE_MODS.items()))
    for aa in residues
}

# Inverse lookup: "<residue>[<MODIFICATION NAME>]" -> a mass-difference alias,
# e.g. "M[OXIDATION]" -> "M[+16]".  When several aliases share the same
# residue and modification, the last one in MASS_DIFF_ALIASES is kept.
MASS_DIFF_ALIASES_I = {
    alias[0] + f"[{mod_name}]": alias for alias, mod_name in MASS_DIFF_ALIASES.items()
}
# NOTE(review): "C" is also a key of AMINO_ACID, so the identity mapping on
# the next statement overwrites this entry and the final value is "C".
# Kept as found; confirm whether "C[+57]" was the intended result.
MASS_DIFF_ALIASES_I.update({"C": "C[+57]"})
MASS_DIFF_ALIASES_I.update({aa: aa for aa in AMINO_ACID})

MOD_PEPTIDE_ALIASES.update(MASS_DIFF_ALIASES)

# This generates aliases like T[181] (rounded residue + modification mass).
# Same precedence rule as above: on collisions the modification listed
# earlier in VARIABLE_MODS wins.
MOD_PEPTIDE_ALIASES.update(
    {
        f"{aa}[{round(MODIFICATION[mod_name] + AMINO_ACID[aa])}]": mod_name
        for mod_name, residues in reversed(list(VARIABLE_MODS.items()))
        for aa in residues
    }
)

# Mass of every plain residue plus every aliased (modified) residue.  The
# first character of an alias is its residue; aliases mapped to "" add no mass.
MOD_AA_MASSES = AMINO_ACID.copy()
MOD_AA_MASSES.update(
    {
        alias: AMINO_ACID[alias[0]] + MODIFICATION.get(mod_name, 0)
        for alias, mod_name in MOD_PEPTIDE_ALIASES.items()
    }
)


# Order in which the fragment labels are nested: outermost first
ION_ENCODING_NESTING = ["CHARGE", "POSITION", "ION_TYPE"]
ION_ENCODING_ITERABLES = {
    "ION_TYPE": "".join(sorted(ION_TYPES)),
    "CHARGE": [f"z{z}" for z in range(1, MAX_FRAG_CHARGE + 1)],
    "POSITION": list(range(1, MAX_ION + 1)),
}

# TODO implement neutral losses ... if needed
# Labels look like "z1b1", "z1y1", "z1b2", ...  A comprehension is used so that
# no loop variables are left behind in the module namespace.
FRAG_EMBEDING_LABELS = [
    f"{charge}{ion}{pos}"
    for charge in ION_ENCODING_ITERABLES[ION_ENCODING_NESTING[0]]
    for pos in ION_ENCODING_ITERABLES[ION_ENCODING_NESTING[1]]
    for ion in ION_ENCODING_ITERABLES[ION_ENCODING_NESTING[2]]
]

NUM_FRAG_EMBEDINGS = len(FRAG_EMBEDING_LABELS)


# Peptide sequence -> vendor and iRT value
IRT_PEPTIDES = {
    "LGGNEQVTR": {"vendor": "biognosys", "irt": -24.92},
    "GAGSSEPVTGLDAK": {"vendor": "biognosys", "irt": 0},
    "VEATFGVDESNAK": {"vendor": "biognosys", "irt": 12.39},
    "YILAGVENSK": {"vendor": "biognosys", "irt": 19.79},
    "TPVISGGPYEYR": {"vendor": "biognosys", "irt": 28.71},
    "TPVITGAPYEYR": {"vendor": "biognosys", "irt": 33.38},
    "DGLDAASYYAPVR": {"vendor": "biognosys", "irt": 42.26},
    "ADVTPADFSEWSK": {"vendor": "biognosys", "irt": 54.62},
    "GTFIIDPGGVIR": {"vendor": "biognosys", "irt": 70.52},
    "GTFIIDPAAVIR": {"vendor": "biognosys", "irt": 87.23},
    "LFLQFGAQGSPFLK": {"vendor": "biognosys", "irt": 100},
    "HEHISSDYAGK": {"vendor": "procal", "irt": -36.83},
    "IGYDHGHIEHK": {"vendor": "procal", "irt": -33.5},
    "TFAHTESHISK": {"vendor": "procal", "irt": -33.32},
    "ISLGEHEGGGK": {"vendor": "procal", "irt": -18.54},
    "YVGDSYDSSAK": {"vendor": "procal", "irt": -16.87},
    "FGTGTYAGGEK": {"vendor": "procal", "irt": -9.35},
    "LSSGYDGTSYK": {"vendor": "procal", "irt": -8.82},
    "TASGVGGFSTK": {"vendor": "procal", "irt": -4.18},
    "LTSGDFGEDSK": {"vendor": "procal", "irt": -3.76},
    "AGDEALGDTYK": {"vendor": "procal", "irt": -3.52},
    "SYASDFGSSAK": {"vendor": "procal", "irt": 1.79},
    "LYSYYSSTESK": {"vendor": "procal", "irt": 6.39},
    "FASDTSDEAFK": {"vendor": "procal", "irt": 7.2},
    "LTDTFADDDTK": {"vendor": "procal", "irt": 8.25},
    "LYTGAGYDEVK": {"vendor": "procal", "irt": 10.53},
    "TLIAYDDSSTK": {"vendor": "procal", "irt": 14.98},
    "TASEFDSAIAQDK": {"vendor": "procal", "irt": 17.84},
    "HDLDYGIDSYK": {"vendor": "procal", "irt": 19.86},
    "FLASSEGGFTK": {"vendor": "procal", "irt": 20.88},
    "HTAYSDFLSDK": {"vendor": "procal", "irt": 25.9},
    "FVGTEYDGLAK": {"vendor": "procal", "irt": 26.82},
    "YALDSYSLSSK": {"vendor": "procal", "irt": 32},
    "YYGTIEDTEFK": {"vendor": "procal", "irt": 33.73},
    "GFLDYESTGAK": {"vendor": "procal", "irt": 35.9},
    "HLTGLTFDTYK": {"vendor": "procal", "irt": 36.5},
    "YFGYTSDTFGK": {"vendor": "procal", "irt": 41.42},
    "HDTVFGSYLYK": {"vendor": "procal", "irt": 41.42},
    "FSYDGFEEDYK": {"vendor": "procal", "irt": 44.22},
    "ALFSSITDSEK": {"vendor": "procal", "irt": 44.88},
    "LYLSEYDTIGK": {"vendor": "procal", "irt": 48.16},
    "HFALFSTDVTK": {"vendor": "procal", "irt": 50.41},
    "VSGFSDISIYK": {"vendor": "procal", "irt": 51.67},
    "GSGGFTEFDLK": {"vendor": "procal", "irt": 51.97},
    "TFTGTTDSFFK": {"vendor": "procal", "irt": 52.2},
    "TFGTETFDTFK": {"vendor": "procal", "irt": 54.53},
    "YTSFYGAYFEK": {"vendor": "procal", "irt": 56.65},
    "LTDELLSEYYK": {"vendor": "procal", "irt": 57.66},
    "ASDLLSGYYIK": {"vendor": "procal", "irt": 57.68},
    "YGFSSEDIFTK": {"vendor": "procal", "irt": 57.77},
    "HTYDDEFFTFK": {"vendor": "procal", "irt": 58.44},
    "FLFTGYDTSVK": {"vendor": "procal", "irt": 61.07},
    "GLSDYLVSTVK": {"vendor": "procal", "irt": 61.34},
    "VYAETLSGFIK": {"vendor": "procal", "irt": 62.57},
    "GLFYGGYEFTK": {"vendor": "procal", "irt": 62.96},
    "GSTDDGFIILK": {"vendor": "procal", "irt": 63.07},
    "TSIDSFIDSYK": {"vendor": "procal", "irt": 63.51},
    "TLLLDAEGFEK": {"vendor": "procal", "irt": 65.49},
    "GFVIDDGLITK": {"vendor": "procal", "irt": 66.46},
    "GFEYSIDYFSK": {"vendor": "procal", "irt": 66.9},
    "GIFGAFTDDYK": {"vendor": "procal", "irt": 71.49},
    "LEIYTDFDAIK": {"vendor": "procal", "irt": 71.99},
    "FTEGGILDLYK": {"vendor": "procal", "irt": 72.95},
    "LLFSYSSGFVK": {"vendor": "procal", "irt": 73.23},
    "STFFSFGDVGK": {"vendor": "procal", "irt": 74.29},
    "LTAYFEDLELK": {"vendor": "procal", "irt": 75.09},
    "VDTFLDGFSVK": {"vendor": "procal", "irt": 76.57},
    "GASDFLSFAVK": {"vendor": "procal", "irt": 77.42},
    "GEDLDFIYVVK": {"vendor": "procal", "irt": 79.62},
    "VSSIFFDTFDK": {"vendor": "procal", "irt": 82.28},
    "SILDYVSLVEKK": {"vendor": "procal", "irt": 83.05},
    "VYGYELTSLFK": {"vendor": "procal", "irt": 87.89},
    "GGFFSFGDLTK": {"vendor": "procal", "irt": 88.04},
    "YDTAIDFGLFK": {"vendor": "procal", "irt": 89.4},
    "IVLFELEGITK": {"vendor": "procal", "irt": 94.97},
    "GIEDYYIFFAK": {"vendor": "procal", "irt": 95.37},
    "SILDYVSLVEK": {"vendor": "procal", "irt": 96.26},
    "AFSDEFSYFFK": {"vendor": "procal", "irt": 99.13},
    "AFLYEIIDIGK": {"vendor": "procal", "irt": 99.61},
}


if __name__ == "__main__":
    # This is implemented so the constants can be printed if needed running this file directly.
    # The snapshot is taken before the loop binds its own names, and names
    # starting with an underscore (dunders, private helpers) are skipped.
    public_names = {
        name: value for name, value in globals().items() if not name.startswith("_")
    }
    for name, value in public_names.items():
        print(f"\n>>> {name} {type(value)} = {value}")