Source code for metaspace_converter.to_anndata

from typing import Container, Optional

import numpy as np
import pandas as pd
from anndata import AnnData
from metaspace import SMInstance
from metaspace.sm_annotation_utils import (
    DEFAULT_DATABASE,
    DatasetNotFound,
    IsotopeImages,
    SMDataset,
)

from metaspace_converter.constants import (
    COL,
    METASPACE_KEY,
    OBS_INDEX_NAME,
    SPATIAL_KEY,
    VAR_INDEX_NAME,
    Shape2d,
)
from metaspace_converter.utils import (
    empty_columns_to_nan,
    has_optical_image,
    iter_property_paths,
    stringify_list_columns,
    transform_coords,
)


def metaspace_to_anndata(
    dataset: Optional[SMDataset] = None,
    dataset_id: Optional[str] = None,
    database: Optional[tuple[str, str]] = None,
    fdr: Optional[float] = 0.1,
    use_tic: bool = False,
    metadata_as_obs: bool = False,
    add_optical_image: bool = False,
    sm: Optional[SMInstance] = None,
    **annotation_filter,
) -> AnnData:
    """
    Downloads a METASPACE dataset to an :py:class:`AnnData` object.

    See: https://metaspace2020.eu/about

    Args:
        dataset: A METASPACE dataset instance. If not provided, `dataset_id` must be given.
        dataset_id: The unique ID of a dataset on METASPACE, e.g. "2021-09-03_11h43m13s"
        database: A single METASPACE database given as a tuple of name and version. Usually it
            is displayed on METASPACE as "HMDB – v4" which corresponds to `("HMDB", "v4")`.
        fdr: Returns only annotations for which the false discovery rate is less than or equal
            to this limit.
        use_tic: When True, the output values will be scaled by the total ion count per pixel
            and will be in the 0.0 to 1.0 range.
        metadata_as_obs: Whether to store metadata in the `obs` dataframe instead of `uns`.
            For a single METASPACE dataset, metadata is the same for all pixels, so it would be
            duplicated for all `obs`. When combining multiple datasets, it would be preserved
            in `obs` but not in `uns`.
        add_optical_image: Whether to embed the optical image for SquidPy.
        sm: Optionally, a cached SMInstance.
        annotation_filter: Additional keyword arguments passed to the METASPACE API.

    Returns:
        An :py:class:`AnnData` object with

        * ion intensities: ``.X``
        * ion image pixel coordinates: ``.obs[["ion_image_pixel_x", "ion_image_pixel_y"]]``
        * spatial coordinates: ``.obsm["spatial"]``
        * ion properties: ``.var``, for example "formula", "adduct", "mz", "fdr",
          "moleculeNames", "moleculeIds", "intensity"…
        * METASPACE metadata: ``.uns["metaspace"]`` if not ``metadata_as_obs``
        * SquidPy metadata: ``.uns["spatial"]`` if ``add_optical_image``

    Raises:
        ValueError: If something is wrong with the input data or parameters
        metaspace.sm_annotation_utils.DatasetNotFound: If the dataset ID is not found
        ConnectionError: For any other network connectivity problems
    """
    # This must stay at the top, above `locals()`, so that `parameters` includes no later
    # defined variables, but `dataset_id` definitely has a value.
    if dataset is not None:
        dataset_id = dataset.id
    # Capture the function parameters (except non-serializable ones) so we can include them
    # in the resulting AnnData.
    parameters = _dict_without_keys(locals(), keys=("sm", "dataset"))

    dataset = get_dataset(dataset=dataset, dataset_id=dataset_id, sm=sm)
    if dataset.status != "FINISHED":
        raise DatasetNotFound(f"Dataset with ID {dataset.id} has not finished processing.")
    if database is None:
        database = DEFAULT_DATABASE

    # Download annotations
    annotations = dataset.results(
        database=database,
        fdr=fdr,
        include_chem_mods=True,
        include_neutral_losses=True,
        **annotation_filter,
    )
    annotations = _add_annotations_index(annotations, index_name=VAR_INDEX_NAME)
    annotations = _normalize_annotations_for_serialization(annotations)

    # Download ion images
    isotope_images = dataset.all_annotation_images(
        only_first_isotope=True,
        fdr=fdr,
        database=database,
        scale_intensity="TIC" if use_tic else True,
        **annotation_filter,
    )
    if not isotope_images:
        raise ValueError(
            f"No isotope images available for dataset {dataset.id} and database "
            f"{database[0]} {database[1]}. Was the database selected for processing on METASPACE?"
        )
    assert len(annotations) == len(isotope_images)
    # Sort isotope images to match the annotations.
    isotope_images = _sort_isotope_images_like(isotope_images, annotations)

    # Create X matrix (all ion pixels flattened to primary axis)
    shape = get_ion_image_shape(dataset)
    ion_intensities = _create_anndata_x(isotope_images, shape)

    # Create observations dataframe
    obs = _create_obs_with_coordinates(shape)

    # Scanpy looks up (x, y) coordinates from obsm.
    obsm = {SPATIAL_KEY: obs[[COL.ion_image_pixel_x, COL.ion_image_pixel_y]].values}

    adata = AnnData(X=ion_intensities, var=annotations, obs=obs, obsm=obsm)

    # Add metadata
    if not metadata_as_obs:
        adata.uns[METASPACE_KEY] = _create_uns_for_metaspace(dataset, parameters=parameters)
    else:
        metadata_df = _create_obs_for_metaspace(
            dataset, obs_names=adata.obs.index, parameters=parameters
        )
        adata.obs = pd.concat([obs, metadata_df], axis=1)

    # Add optical image
    if add_optical_image and has_optical_image(dataset):
        _add_obsm_and_uns_for_squidpy(
            adata,
            dataset=dataset,
            pixel_coords_xy=obs[[COL.ion_image_pixel_x, COL.ion_image_pixel_y]],
        )

    return adata
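
# Example usage (a hedged sketch, not part of the module): the dataset ID and database below
# are the placeholder values from the docstring above and need to be replaced with those of a
# real, finished METASPACE dataset.
#
#     adata = metaspace_to_anndata(
#         dataset_id="2021-09-03_11h43m13s",
#         database=("HMDB", "v4"),
#         fdr=0.1,
#     )
#     adata.X                # intensity matrix (pixels × ions)
#     adata.obsm["spatial"]  # per-pixel (x, y) coordinates used by Scanpy/SquidPy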


def create_annotation_id(
    formula: str, adduct: str, chem_mod: str = "", neutral_loss: str = ""
) -> str:
    return f"{formula}{adduct}{chem_mod}{neutral_loss}"
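
# Illustration (derived only from the f-string above): an annotation with formula "C8H10N4O2"
# and adduct "+H" and no chemical modification or neutral loss gets the ID "C8H10N4O2+H";
# with a neutral loss such as "-H2O" it would be "C8H10N4O2+H-H2O".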


def _add_annotations_index(df: pd.DataFrame, index_name: str = VAR_INDEX_NAME) -> pd.DataFrame:
    df = df.reset_index()
    df[index_name] = df.apply(
        lambda row: create_annotation_id(
            row.formula, row.adduct, getattr(row, "chemMod", ""), getattr(row, "neutralLoss", "")
        ),
        axis=1,
    )
    return df.set_index(index_name)


def _normalize_annotations_for_serialization(df: pd.DataFrame) -> pd.DataFrame:
    # AnnData cannot save dataframes containing Python objects (like lists).
    # As a workaround, we stringify to JSON.
    df = stringify_list_columns(df, columns=["isotopeImages", "moleculeNames", "moleculeIds"])
    # AnnData also cannot save columns containing only None (dtype object, tries to save as str):
    # "Can't implicitly convert non-string objects to strings"
    return empty_columns_to_nan(df)


def get_dataset(
    dataset: Optional[SMDataset] = None,
    dataset_id: Optional[str] = None,
    sm: Optional[SMInstance] = None,
) -> SMDataset:
    """
    Fetch a dataset instance, if not provided
    """
    if dataset is not None:
        return dataset
    else:
        if dataset_id is None:
            raise ValueError("Either `dataset` or `dataset_id` must be provided")
        if sm is None:
            sm = SMInstance()
        return sm.dataset(id=dataset_id)


def get_ion_image_shape(
    dataset: Optional[SMDataset] = None,
    dataset_id: Optional[str] = None,
    sm: Optional[SMInstance] = None,
) -> Shape2d:
    """
    Get the shape of a METASPACE dataset (in Numpy order).

    Args:
        dataset: A METASPACE dataset instance. If not provided, a dataset ID must be given.
        dataset_id: An optional METASPACE dataset ID, if no dataset instance is provided.
        sm: An optional SMInstance, otherwise one will be created.
    """
    dataset = get_dataset(dataset=dataset, dataset_id=dataset_id, sm=sm)
    return dataset.image_size["y"], dataset.image_size["x"]
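
# Example (a sketch; requires network access and a real dataset ID):
#
#     shape = get_ion_image_shape(dataset_id="2021-09-03_11h43m13s")
#     # e.g. (300, 400), i.e. (height, width) in Numpy order; the values are hypothetical.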


def _sort_isotope_images_like(
    isotope_images: list[IsotopeImages], df: pd.DataFrame
) -> list[IsotopeImages]:
    images_dict = {
        (img.formula, img.adduct, img.chem_mod, img.neutral_loss): img for img in isotope_images
    }
    # Return them in the requested order.
    # Note: pd.DataFrame.itertuples yields NamedTuple and is faster than iterrows.
    return [
        images_dict[(row.formula, row.adduct, row.chemMod, row.neutralLoss)]
        for row in df.itertuples(index=False)
    ]


def _create_anndata_x(isotope_images: list[IsotopeImages], shape: Shape2d) -> np.ndarray:
    n_pixels = np.prod(shape)
    n_ions = len(isotope_images)
    y_x_ion_stack = np.dstack(
        [_isotope_image_to_array(img, shape=shape) for img in isotope_images]
    )
    return y_x_ion_stack.reshape(n_pixels, n_ions)


def _isotope_image_to_array(ion_image: IsotopeImages, shape: Shape2d) -> np.ndarray:
    if len(ion_image) == 0 or ion_image[0] is None:
        return np.zeros(shape=shape)
    else:
        return ion_image[0]


def _create_obs_with_coordinates(shape: Shape2d) -> pd.DataFrame:
    n_pixels = np.prod(shape)
    pixel_coords_yx = all_image_pixel_coordinates(shape)
    return pd.DataFrame(
        {
            COL.ion_image_pixel_x: pixel_coords_yx[:, 1],
            COL.ion_image_pixel_y: pixel_coords_yx[:, 0],
            COL.ion_image_shape_y: shape[0],
            COL.ion_image_shape_x: shape[1],
        },
        index=pd.RangeIndex(n_pixels, name=OBS_INDEX_NAME).astype(str),
    )
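
# Note on layout (follows from the helpers above, not an addition to the API):
# `_create_anndata_x` and `_create_obs_with_coordinates` both flatten the 2D ion image in
# row-major (y, x) order, so obs row i and X row i describe the same pixel, and the image of
# ion j can be recovered with `X[:, j].reshape(shape)`.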


def all_image_pixel_coordinates(shape: Shape2d) -> np.ndarray:
    """
    Returns a list of Y, X coordinates (Numpy order) for every pixel of an image.

    Args:
        shape: The shape of a 2D image

    Returns:
        Array of shape (number of pixels, 2)
    """
    ys_and_xs = np.indices(shape)
    return np.dstack(ys_and_xs).reshape((-1, len(shape)))
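
# Worked example (verifiable with NumPy alone): for a 2×2 image the pixels are listed in
# row-major order as (y, x) pairs.
#
#     all_image_pixel_coordinates((2, 2))
#     # array([[0, 0],
#     #        [0, 1],
#     #        [1, 0],
#     #        [1, 1]])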


def _create_default_spatial_uns(
    image: Optional[np.ndarray] = None,
    scale_factor: float = 1.0,
    spot_diameter: float = 1,
    library_id: str = "image",
    image_key: str = "hires",
) -> dict:
    return {
        library_id: {
            "images": {image_key: image} if image is not None else {},
            "scalefactors": {
                f"tissue_{image_key}_scalef": scale_factor,
                "spot_diameter_fullres": spot_diameter,
            },
        }
    }


def _create_uns_for_metaspace(dataset: SMDataset, parameters: Optional[dict] = None) -> dict:
    # Database details contain instances of MolecularDB, which are not JSON-serializable.
    # Also, AnnData does not support lists of dicts, so we have to convert to a dict of dicts.
    # See issue https://github.com/scverse/anndata/issues/708
    plain_db_details = {str(idx): db._info for idx, db in enumerate(dataset.database_details)}
    plain_projects = {str(idx): project for idx, project in enumerate(dataset.projects)}
    plain_parameters = {k: list(v) if isinstance(v, tuple) else v for k, v in parameters.items()}
    return {
        "id": dataset.id,
        "name": dataset.name,
        "adducts": dataset.adducts,
        "databases": dataset.databases,
        "database_details": plain_db_details,
        "polarity": dataset.polarity,
        "image_size": dataset.image_size,
        "submitter": dataset.submitter,
        "group": dataset.group,
        "principal_investigator": dataset.principal_investigator,
        "projects": plain_projects,
        "status": dataset.status,
        "metadata": dict(dataset.metadata),
        "config": dataset.config,
        # Include the parameters used to generate this AnnData.
        f"{metaspace_to_anndata.__name__}_parameters": plain_parameters,
    }


def _create_obs_for_metaspace(
    dataset: SMDataset, obs_names: pd.Index, parameters: Optional[dict] = None
) -> pd.DataFrame:
    metadata = _create_uns_for_metaspace(dataset=dataset, parameters=parameters)
    flattened_metadata = iter_property_paths(
        obj={METASPACE_KEY: metadata},
        delimiter=".",
        dont_traverse_types=(tuple, list),
        dont_descend_paths=[f"{METASPACE_KEY}.{metaspace_to_anndata.__name__}_parameters"],
    )
    metadata_series = pd.Series(dict(flattened_metadata))
    obs = pd.DataFrame([metadata_series] * len(obs_names), index=obs_names)
    # METASPACE metadata contains a lot of list values.
    # AnnData cannot save dataframes containing Python objects (like lists).
    # As a workaround, we stringify to JSON.
    obs = stringify_list_columns(obs, all=True)
    return empty_columns_to_nan(obs)


def _add_obsm_and_uns_for_squidpy(adata: AnnData, dataset: SMDataset, pixel_coords_xy: np.ndarray):
    optical_images = dataset.optical_images()
    image_yx_rgb = optical_images[0]
    # Transformation from the ion image coordinate system to the optical image (x, y order!).
    matrix_xy = optical_images._transforms[0]
    coords_on_optical_xy = transform_coords(pixel_coords_xy, matrix_xy)
    # SquidPy looks up (x, y) coordinates from obsm.
    adata.obsm[SPATIAL_KEY] = coords_on_optical_xy
    # SquidPy uses spatial information from uns.
    adata.uns[SPATIAL_KEY] = _create_default_spatial_uns(image=image_yx_rgb, spot_diameter=0.5)


def _dict_without_keys(dictionary: dict, keys: Container) -> dict:
    # Exclude keys from a dictionary.
    return {k: v for k, v in dictionary.items() if k not in keys}