#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright © 2020-2021 Matthew Smith, Ralf Weber
#
# This file is part of AMIMSpy.
#
# AMIMSpy is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# AMIMSpy is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with AMIMSpy. If not, see <https://www.gnu.org/licenses/>.
#
import re
from collections import OrderedDict
from dimspy.process.peak_filters import filter_attr
[docs]class Scans:
"""
The Scans class.
This class is used to extract high quality scan data from a given sample using a user defined method.
:param run: Spectral data from multiple samples contained in a single *.mzML file
:param well: Well label as provided in the corresponding metadata *.txt file
:param well_scans: Scan IDs for all scans in a given well
:param id_snr: User provided SNR threshold for differentiating between on and off scans
:param id_tol: User provided number of features with SNR > id_snr to tolerate in off scans for labelling the scan type
"""
def __init__(self, run, well, well_scans, id_snr, id_tol):
    """
    Store the inputs and derive the on/off labelling of every scan in one well.

    :param run: Object whose ``peaklists`` method supplies the spectral data
    :param well: Well label; used in the message produced when extraction fails
    :param well_scans: Scan IDs for all scans in the given well
    :param id_snr: SNR threshold applied when filtering each peak list
    :param id_tol: Number of peaks a scan may contain and still be labelled an off scan
    """
    # Name of the extraction method stored on the instance
    self.method = "on_scans_no_edge"
    self.well, self.dat = well, run
    self.id_snr, self.id_tol = id_snr, id_tol
    # The three derived attributes depend on each other, so the order below matters:
    # SNR-filtered peak lists, one per scan ID in well_scans
    self.pls = self.peaklists(well_scans)
    # Scan ID -> 1 (on scan) / 0 (off scan)
    self.binary_dict = self.dictionary()
    # The same labels joined into a "00"-padded string, searched for on/off
    # cycle patterns
    self.binary_str = self.padding()
def peaklists(self, well_scans):
    """
    Build the SNR-filtered peak lists for the requested scans.

    The peak lists are requested from ``self.dat`` with ``function_noise="median"``.
    Every non-empty peak list is then passed to ``filter_attr`` on its ``snr``
    attribute with ``self.id_snr`` as the minimum threshold; peak lists without any
    m/z values are kept untouched. The result is also stored on ``self.pls``.

    :param well_scans: List of scan IDs from all scans in the given well.
    :return: List of peaklist objects
    """
    self.pls = self.dat.peaklists(well_scans, function_noise="median")
    filtered = []
    for pl in self.pls:
        if len(pl.mz) > 0:
            # NOTE(review): presumably this flags peaks below the SNR threshold
            # rather than deleting them - confirm against dimspy's filter_attr.
            pl = filter_attr(pl, "snr", min_threshold=self.id_snr)
        filtered.append(pl)
    self.pls = filtered
    return filtered
def dictionary(self):
    """
    Label every scan in ``self.pls`` as an on scan (1) or an off scan (0).

    A scan is labelled 1 when its peak list holds more than ``self.id_tol`` m/z
    values, otherwise 0. Keys are the scan IDs converted to ``int``, kept in the
    order of ``self.pls``. The result is also stored on ``self.binary_dict``.

    :return: Dictionary object
    """
    labelled = OrderedDict()
    for pl in self.pls:
        labelled[int(pl.ID)] = 1 if len(pl.mz) > self.id_tol else 0
    self.binary_dict = labelled
    return labelled
def padding(self):
    """
    Join the on/off labels of ``self.binary_dict`` into one string padded with "00".

    The two extra characters on either side let on/off cycles be identified at the
    start and end of each well. The result is also stored on ``self.binary_str``.

    :return: String object
    """
    bits = "".join(str(flag) for flag in self.binary_dict.values())
    padded = "00{}00".format(bits)
    self.binary_str = padded
    return padded
def _extract_all_scans(self):
    """
    Extracts the scan IDs for all scans from the given well.

    :return: List object holding the keys of ``self.binary_dict`` in their stored order
    """
    # dict.keys() is a live view in Python 3; materialise it so callers really
    # receive the documented list (indexable, independent of later changes).
    return list(self.binary_dict)
def _extract_off_scans(self):
    """
    Extracts the scan IDs for only the off scans from the given well.

    The off scans are every ID returned by ``self.extract("all_scans")`` that is not
    among the IDs returned by ``self.extract("on_scans")``; the order of the
    "all_scans" result is preserved.

    :return: List object
    """
    scan_ids = self.extract("all_scans")
    # A set gives constant-time membership tests for the exclusion below.
    on_scan_ids = set(self.extract("on_scans"))
    # Build the result eagerly. The earlier chained ``filter(lambda ...)`` calls were
    # lazy and all shared the loop variable, so under Python 3 only the last scan
    # iterated was ever removed and an iterator was returned, not a list.
    return [scan for scan in scan_ids if scan not in on_scan_ids]
def _extract_on_scans(self, pattern_idx):
    """
    Search ``self.binary_str`` for each on/off cycle pattern and collect the scan IDs
    selected by that pattern's indices.

    Each key of ``pattern_idx`` is used as a regular expression inside a lookahead,
    so overlapping occurrences are all found. The value is a sequence of indices
    relative to the start of the match: two or more entries select the half-open
    range between the first two, a single entry selects exactly one scan.

    :param pattern_idx: Dictionary of possible on/off scan cycles from AMI-MS data as keys and the indices
        of the scans within each cycle to be extracted for the 'on_scans' method.
    :return: Sorted list of scan IDs; if a ValueError occurs during extraction, a string
        describing the failure is returned instead
    :raises Exception: if a matched pattern has an empty index sequence
    """
    ordered_ids = list(self.binary_dict.keys())
    selected = []
    for pattern, offsets in pattern_idx.items():
        lookahead = r'(?=(' + pattern + '))'
        for match in re.finditer(lookahead, self.binary_str):
            # binary_str carries two synthetic off scans in front, so shift the
            # match position back by 2 to index into the real scan IDs.
            base = match.start() - 2
            try:
                n_offsets = len(offsets)
                if n_offsets == 0:
                    raise Exception("Incorrect format")
                if n_offsets == 1:
                    # single index, e.g. the median on scan of the cycle
                    selected.append(ordered_ids[base + offsets[0]])
                else:
                    first = base + offsets[0]
                    last = base + offsets[1]
                    selected.extend(ordered_ids[first:last])
            except ValueError as e:
                return "Extract failed for well: {}, failed due to: {}.".format(self.well, e)
    selected.sort()
    return selected
def _extract_on_scans_no_edge(self, pattern_idx):
    """
    Search ``self.binary_str`` for each on/off cycle pattern and collect the scan IDs
    in the index range given for that pattern.

    Each key of ``pattern_idx`` is used as a regular expression inside a lookahead;
    its value supplies a start and an end index (end exclusive) relative to the start
    of the match. The SECOND pattern in iteration order is treated specially: from the
    scans in its range only the one with the highest ``metadata.tic`` is kept.
    NOTE(review): this presumably corresponds to the "only two consecutive on scans"
    cycle - confirm the ordering of the dictionary built by the caller.

    :param pattern_idx: Dictionary of possible on/off scan cycles from AMI-MS data as keys and the indices
        of the scans within each cycle to be extracted for the 'on_scans_no_edge' method.
    :return: Sorted list of scan IDs; if a ValueError occurs during extraction (for
        example no peak list matches the selected range), a string describing the
        failure is returned instead
    """
    # Loop-invariant: the scan IDs in order, used to translate positions to IDs
    ordered_ids = list(self.binary_dict.keys())
    scan_ids = []
    for position, (pattern, bounds) in enumerate(pattern_idx.items()):
        for m in re.finditer(r'(?=(' + pattern + '))', self.binary_str):
            start = m.start()
            try:
                # - 2 accounts for the two synthetic off scans that pad binary_str
                idx_s = start + bounds[0] - 2
                idx_e = start + bounds[1] - 2
                window = ordered_ids[idx_s:idx_e]
                if position == 1:
                    wanted = set(window)
                    tic_by_id = {
                        pl.ID: pl.metadata.tic
                        for pl in self.pls
                        if int(pl.ID) in wanted
                    }
                    # Keep the single scan with the highest TIC; max() raises
                    # ValueError when nothing matched, handled below.
                    scan_ids.append(int(max(tic_by_id, key=tic_by_id.get)))
                else:
                    scan_ids.extend(window)
            except ValueError as e:
                return "Extract failed for well: {}, failed due to: {}.".format(self.well, e)
    scan_ids.sort()
    return scan_ids