Manuscript d Fructose Example ms_matplotlib#

This shows the code required to plot the d-fructose example in the manuscript. The mzML file is from the the Human Metabolome Database (ID: HMDB0000660)

This entry contains an annotated spectrum:

mz

formula

SMILES

45.0

CHO2

C(=O)O

61.0

C2H5O2

CC(=O)O

73.0

C2HO3

O=CC(=O)O

75.0

C2H3O3

OCC(=O)O

87.0

C3H3O3

O=CC(=O)CO

To generate a prediction using CFM-ID, you can use the SMILE or InChi key. This example used the SMILE.

SMILE: OC[C@H]1OC@(CO)C@@H[C@@H]1O InChi: InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1

For this tutorial CFM-ID and .mzML file are downloaded from the pyopenms_viz repository

import numpy as np
import pandas as pd
pd.options.plotting.backend = "ms_matplotlib"

Download example#

import requests
import zipfile
import io

# URL of the ZIP file
url = "https://github.com/OpenMS/pyopenms_viz/releases/download/manuscript/d_fructose_example.zip"

# Download the ZIP file
response = requests.get(url)
response.raise_for_status()  # Check for request errors

# Unzip the content
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
    zip_file.extractall("d_fructose_example")  # Extract to a directory
    print("Files extracted to 'd_fructose_example'")
Files extracted to 'd_fructose_example'

Helper Functions#

We will need to define the following helper methods below to help format/extract data and generate the molecular chemical drawings.

# Utility to import RDKit modules and disable logging
%%bash
pip install rdkit --quiet
import importlib
from PIL import Image, ImageOps
import rdkit.RDLogger as RDLogger
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import cairosvg
import io
import re
from typing import Tuple, Union, List

def rdkit_import(*module_strs: List[str]) -> Tuple:
    """Utility function to import RDKit modules and disable RDKit logging."""
    RDLogger.DisableLog('rdApp.*')
    return tuple(importlib.import_module(mod) for mod in module_strs)

def trim_img_by_white(img: Image.Image, padding: int = 0) -> Image.Image:
    """Crops the image to remove excess white space and adds padding."""
    as_array = np.array(img)
    assert as_array.ndim == 3 and as_array.shape[2] == 3, as_array.shape

    # Find non-white regions
    has_content = np.sum(as_array, axis=2, dtype=np.uint32) != 255 * 3
    xs, ys = np.nonzero(has_content)

    # Crop and pad
    x_range = max(min(xs) - 5, 0), min(max(xs) + 5, as_array.shape[0])
    y_range = max(min(ys) - 5, 0), min(max(ys) + 5, as_array.shape[1])
    cropped_img = as_array[x_range[0]:x_range[1], y_range[0]:y_range[1], :3]

    img_cropped = Image.fromarray(cropped_img, mode='RGB')
    return ImageOps.expand(img_cropped, border=padding, fill=(255, 255, 255))

def mol_from_smiles(smiles: str, standardize: bool = False) -> Union[Chem.Mol, None]:
    """Converts a SMILES string to an RDKit molecule."""
    Chem = rdkit_import("rdkit.Chem")[0]
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol and standardize:
            mol = rdkit_standardize(mol)
        return mol
    except Exception as e:
        print(f"Error converting SMILES to Mol: {e}")
        return None

def get_mol_im(smiles: str, width: int = 400, height: int = 200) -> Image.Image:
    """Generates an image of the molecule from its SMILES string."""
    mol = mol_from_smiles(smiles)
    if mol:
        d = rdMolDraw2D.MolDraw2DSVG(width, height)
        d.DrawMolecule(mol)
        d.FinishDrawing()
        svg_buf = d.GetDrawingText()

        png_buf = cairosvg.svg2png(bytestring=svg_buf)
        im = Image.open(io.BytesIO(png_buf))
        return trim_img_by_white(im, padding=15)
    else:
        return None

def parse_cfm_data(file_path: str, energy_section: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Parses CFM data for a specific energy section."""
    with open(file_path, 'r') as file:
        lines = file.readlines()

    mz_data, annotation_data = [], []
    capture_data, capture_annotations = False, False

    # Regex patterns
    energy_pattern = re.compile(f"^{energy_section}$")
    mz_pattern = re.compile(r"^(\d+\.\d+)\s+([\d\.]+)\s+([\d\s]+)\s+\(([\d\s\.]+)\)$")
    annotation_pattern = re.compile(r"^(\d+)\s+(\d+\.\d+)\s+(\S+)(?:\s+(.+))?$")

    for line in lines:
        line = line.strip()
        if energy_pattern.match(line):
            capture_data = True
            continue

        if capture_data and (line == "" or line.startswith("energy")):
            capture_data = False
            capture_annotations = True
            continue

        if capture_data:
            match = mz_pattern.match(line)
            if match:
                mz, rel_int, frag_id, probabilities = match.groups()
                mz_data.append([float(mz), float(rel_int), frag_id.strip(), probabilities.strip()])

        if capture_annotations:
            match = annotation_pattern.match(line)
            if match:
                frag_id, mz, smiles, note = match.groups()
                annotation_data.append([int(frag_id), float(mz), smiles, note])

    mz_df = pd.DataFrame(mz_data, columns=['mz', 'int', 'frag_id', 'probabilities'])
    annotation_df = pd.DataFrame(annotation_data, columns=['frag_id', 'mz', 'SMILES', 'note'])

    return mz_df, annotation_df

def explode_frag_ids_probabilities(df: pd.DataFrame) -> pd.DataFrame:
    """Explodes the 'frag_id' and 'probabilities' columns into multiple rows."""
    df = df.copy()
    df['frag_id'] = df['frag_id'].str.split()
    df['probabilities'] = df['probabilities'].str.split().apply(lambda x: [float(p) for p in x])

    return df.explode(['frag_id', 'probabilities']).astype({'frag_id': int})

def smiles_to_formula(smiles: str) -> Union[str, None]:
    """Converts a SMILES string to a molecular formula."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        return rdMolDescriptors.CalcMolFormula(mol) if mol else None
    except Exception as e:
        print(f"Error converting SMILES to formula: {e}")
        return None

def format_formula(formula: str) -> str:
    """Formats a molecular formula by replacing numbers with subscripts and handling charges."""
    if isinstance(formula, str):
        # Function to replace numbers with their subscript version
        def replace_with_subscript(match):
            return f"_{{{match.group()}}}"

        # Replace numbers with their subscript versions
        formatted = re.sub(r'\d+', replace_with_subscript, formula)

        # Replace '+' and '-' with superscript versions
        formatted = formatted.replace('+', '^+').replace('-', '^-')

        return formatted
    else:
        return formula

Load and Format Data#

Load .mzML using pymzml#

import pymzml

# Load the known annotated SMILES and formula into a dataframe
data = {
    "mz": [45.0, 61.0, 73.0, 75.0, 87.0],
    "formula": ["CHO2", "C2H5O2", "C2HO3", "C2H3O3", "C3H3O3"],
    "SMILES": ["C(=O)O", "CC(=O)O", "O=CC(=O)O", "OCC(=O)O", "O=CC(=O)CO"]
}
ann_df = pd.DataFrame(data)

# We can use pymzml to load the spectrum data from the mzML file

input_file = "./d_fructose_example/d_fructose_example/D-Fructose_lc_msms.mzML"

run = pymzml.run.Reader(input_file)
# Get the first spec
spec = run.next()
# Convert to Pandas DataFrame
df = pd.DataFrame(spec.centroidedPeaks).rename(columns={0:'mz', 1:'int'})
df['rel_int'] = df['int'] / df['int'].max() * 100
df['color_peak'] = "#1b9e77"
i=1
for mz in df['mz']:
    if np.floor(mz) in ann_df['mz'].values:
        # print(f"{ann_df[['mz']].isin([mz]).values.flatten()}")
        df.loc[df.mz==mz, ['custom_annotation']] = ann_df.loc[ann_df[['mz']].isin([mz]).values.flatten(), ['formula']].values[0][0]
        # Apply the formatting function to the 'formula' column
        df.loc[df.mz==mz, ['custom_annotation']] = df.loc[df.mz==mz]['custom_annotation'].apply(format_formula)
        i+=1
# replace the NaN values with empty strings in custom_annotation column
df['custom_annotation'] = df['custom_annotation'].fillna('')
df.head()
[Warning] Not index found and build_index_from_scratch is False
mz int rel_int color_peak custom_annotation
0 41.0 22.867 22.867 #1b9e77
1 45.0 47.956 47.956 #1b9e77 CHO_{2}
2 57.0 10.720 10.720 #1b9e77
3 59.0 11.557 11.557 #1b9e77
4 61.0 39.667 39.667 #1b9e77 C_{2}H_{5}O_{2}


Load CFM-ID#

For CFM-ID, you need to save the predictions out to a txt file using their export option. In this case, the prediction is saved to “cfm_d_fructose_inchi_prediction.txt”. The energy2 prediction corresponds to the 40V collision energy of our downloaded real spectrum.

pred_df, annotation_df = parse_cfm_data('./d_fructose_example/d_fructose_example/cfm_prediction/cfm_d_fructose_smile_prediction.txt', 'energy2')
pred_df['rel_int'] = pred_df['int'] / pred_df['int'].max() * 100
pred_df_long = explode_frag_ids_probabilities(pred_df)
pred_df['color_peak'] = "#d95f02"
i=1
for mz in pred_df['mz']:
    if np.floor(mz) in ann_df['mz'].values:
        pred_df.loc[pred_df.mz==mz, ['custom_annotation']] = f"f{i}"
        i+=1


# We also want to create a dataframe with the predictions for each fragment ion with their corresponding probabilities.
pred_ann_df = annotation_df[(annotation_df['frag_id'].isin( pred_df_long.frag_id.values.tolist()))]
# Apply the function to the SMILES column to create a new column 'formula'
pred_ann_df['formula'] = pred_ann_df['SMILES'].apply(smiles_to_formula)
pred_ann_df = pred_ann_df = pred_ann_df.loc[pred_ann_df['mz'].isin(pred_df.dropna().mz.values.tolist())]
pred_ann_df.sort_values('mz', inplace=True)
pred_ann_df = pd.merge(pred_ann_df, pred_df_long, on=["frag_id", 'mz'], how="inner")
pred_ann_df.sort_values(['mz', "probabilities"], inplace=True, ascending=[True, False])
# Apply the formatting function to the 'formula' column
pred_ann_df['formatted_formula'] = pred_ann_df['formula'].apply(format_formula)
/home/runner/work/pyopenms_viz/pyopenms_viz/docs/gallery_scripts/ms_matplotlib/plot_manuscript_d_fructose_spectrum_prediction_ms_matplotlib.py:275: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Create Figure#

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

fig = plt.figure(figsize=(19, 9), dpi=300)

# Create the gridspec layout: 1 row and 2 columns, with larger space on the left
gs = fig.add_gridspec(1, 2, width_ratios=[2.5, 2], wspace=0.05)

# Create the main spectrum subplot
ax1 = fig.add_subplot(gs[0, 0])

# Plot Mass Spectrum
p = df.plot(kind="spectrum", x="mz", y="rel_int", title="D-Fructose", ylabel="Relative Intensity",
        bin_peaks=False,
             reference_spectrum=pred_df[['mz', 'rel_int', 'color_peak']].assign(custom_annotation=""),
             mirror_spectrum=True,
             annotate_mz=False, annotate_top_n_peaks='all',
            #  annotation_color="color_annotation",
             custom_annotation="custom_annotation",
             peak_color="color_peak",
            annotation_font_size=16,
            xlabel_font_size=20,
            ylabel_font_size=20,
            width=1000, height=700, grid=False, fig=ax1)
ax1.set_xlim([39, 183])
ax1.set_ylim(-105, 105)
# Customize the y-axis ticks and labels
y_ticks = ax1.get_yticks()
ax1.set_yticks(y_ticks)
ax1.set_yticklabels([f'{abs(y):.0f}' for y in y_ticks])
# Update the title, xlabel, and ylabel
ax1.set_title("D-Fructose", fontsize=22)
ax1.set_xlabel("mass-to-charge", fontsize=20)
ax1.set_ylabel("Relative Intensity", fontsize=20)
# Update axis tick labels
ax1.tick_params(axis="x", labelsize=18)
ax1.tick_params(axis="y", labelsize=18)

font_size=16
leg_font_size = 0.9*font_size
leg_kws = {'ncol': 1, 'fontsize': leg_font_size, "loc": "upper left"}
handles = [
    patches.Patch(facecolor="#1b9e77",label="Real",alpha=1.0),
    patches.Patch(facecolor="#d95f02",label="Predicted",alpha=1.0),
]
leg = ax1.legend(handles=handles,**leg_kws)

mol_im = get_mol_im("OC[C@H]1O[C@](O)(CO)[C@@H](O)[C@@H]1O", 600, 500)
mol_im_arr = np.array(mol_im)
# Create an inset axes in the top right corner
axins = inset_axes(ax1,
                   width="40%", # width = 30% of parent_bbox
                   height="40%", # height : 30%
                   loc='upper right')

# Display the molecular image in the inset
axins.imshow(mol_im_arr)
axins.axis('off')  # Turn off axis labels


## Add predicted annotations

# Sort the DataFrame by m/z values
pred_df_sorted = pred_df.sort_values('mz').dropna()

# Define your custom shifts
x_shifts = [0, 0, -6, 10, 0]
y_shifts = [0, 0, 0, 0, 0]

# Base offsets
base_x_offset = -7
base_y_offset = -25

for i, row in enumerate(pred_df_sorted.itertuples(index=False)):
    mz, intensity, frag_id, probabilities, y, color_peak, text = row

    # Get the custom shifts for this annotation
    x_shift = x_shifts[i] if i < len(x_shifts) else 0
    y_shift = y_shifts[i] if i < len(y_shifts) else 0

    # Calculate the final offsets
    x_offset = base_x_offset + x_shift
    y_offset = base_y_offset + y_shift

    # Add the annotation
    ax1.annotate(
        text,
        xy=(mz, -y),  # The point to annotate
        xytext=(x_offset, y_offset),
        textcoords="offset points",
        fontsize=17,
        color="white",
        bbox={"boxstyle": "circle", "color":"#d95f02", 'fc':"#e57f1d"},
        arrowprops=dict(facecolor='grey'),
    )


# Right side: use gridspec with rows based on unique m/z values and 4 columns (1 for annotation, 3 for molecules)
unique_mz = pred_ann_df['mz'].unique()
num_rows = len(unique_mz)
gs_right = gs[0, 1].subgridspec(num_rows, 4, width_ratios=[2.5, 3, 3, 3])  # Increased width for annotation column
i = 1

# Create subplots in the grid on the right
for row, mz in enumerate(unique_mz):
    mz_df = pred_ann_df[pred_ann_df['mz'] == mz]

    # Add annotation column
    ax_annot = fig.add_subplot(gs_right[row, 0])

    # Add circular f# annotation
    ax_annot.annotate(f'f{i}', xy=(0.1, 0.5), xycoords='axes fraction',
                      fontsize=17, color="white",
                      bbox={"boxstyle": "circle", "color":"#d95f02", 'fc':"#e57f1d"})

    # Add formula annotation
    ax_annot.text(0.5, 0.5, r'${}$'.format(mz_df.iloc[0]["formatted_formula"]),
                  ha='left', va='center', fontsize=20)

    ax_annot.axis('off')

    # Add molecule images
    for col, (idx, row_data) in enumerate(mz_df.iterrows()):
        if col < 3:  # Limit to 3 columns
            ax = fig.add_subplot(gs_right[row, col+1])
            mol_im = get_mol_im(row_data['SMILES'], 200, 100)
            mol_im_arr = np.array(mol_im)
            ax.imshow(mol_im_arr)

            # Add probability text below the molecule
            prob = row_data['probabilities']
            ax.text(0.5, -0.1, f"Score: {prob:.4f}", transform=ax.transAxes,
                    ha='center', va='top', fontsize=18)

            ax.axis('off')
    i += 1


fig.tight_layout()
fig
D-Fructose
/home/runner/work/pyopenms_viz/pyopenms_viz/docs/gallery_scripts/ms_matplotlib/plot_manuscript_d_fructose_spectrum_prediction_ms_matplotlib.py:427: UserWarning:

This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.


<Figure size 5700x2700 with 16 Axes>

Total running time of the script: (0 minutes 1.860 seconds)

Gallery generated by Sphinx-Gallery