from typing import Container, Optional
import numpy as np
import pandas as pd
from anndata import AnnData
from metaspace import SMInstance
from metaspace.sm_annotation_utils import (
DEFAULT_DATABASE,
DatasetNotFound,
IsotopeImages,
SMDataset,
)
from metaspace_converter.constants import (
COL,
METASPACE_KEY,
OBS_INDEX_NAME,
SPATIAL_KEY,
VAR_INDEX_NAME,
Shape2d,
)
from metaspace_converter.utils import (
empty_columns_to_nan,
has_optical_image,
iter_property_paths,
stringify_list_columns,
transform_coords,
)
[docs]
def create_annotation_id(
    formula: str, adduct: str, chem_mod: str = "", neutral_loss: str = ""
) -> str:
    """
    Build the identifier of an annotation by concatenating its ion components.

    Args:
        formula: The sum formula of the annotated molecule.
        adduct: The adduct part of the ion.
        chem_mod: Optional chemical modification, empty by default.
        neutral_loss: Optional neutral loss, empty by default.

    Returns:
        The four parts joined without any separator, in the order
        formula, adduct, chemical modification, neutral loss.
    """
    return "{}{}{}{}".format(formula, adduct, chem_mod, neutral_loss)
def _add_annotations_index(df: pd.DataFrame, index_name: str = VAR_INDEX_NAME) -> pd.DataFrame:
    """
    Re-index an annotations table by the annotation ID of each row.

    The current index is first moved back into regular columns, then an ID
    is computed per row with ``create_annotation_id`` and used as new index.

    Args:
        df: Annotations table; each row must expose ``formula`` and ``adduct``.
            ``chemMod`` and ``neutralLoss`` are used when present.
        index_name: Name of the column (and resulting index) holding the IDs.

    Returns:
        A new dataframe indexed by ``index_name``.
    """

    def _row_to_id(row) -> str:
        # chemMod/neutralLoss fall back to an empty string when the row has no such field.
        return create_annotation_id(
            row.formula,
            row.adduct,
            getattr(row, "chemMod", ""),
            getattr(row, "neutralLoss", ""),
        )

    flat = df.reset_index()
    flat[index_name] = flat.apply(_row_to_id, axis=1)
    return flat.set_index(index_name)
def _normalize_annotations_for_serialization(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare an annotations table so that AnnData is able to write it to disk.

    Args:
        df: The annotations table.

    Returns:
        The table after stringifying the list-valued columns and passing it
        through ``empty_columns_to_nan``.
    """
    # Python objects such as lists cannot be stored by AnnData,
    # so these columns are converted to JSON strings instead.
    list_valued_columns = ["isotopeImages", "moleculeNames", "moleculeIds"]
    stringified = stringify_list_columns(df, columns=list_valued_columns)
    # Columns holding nothing but None have dtype object; AnnData would try to write
    # them as strings and fail with
    # "Can't implicitly convert non-string objects to strings"
    return empty_columns_to_nan(stringified)
[docs]
def get_dataset(
    dataset: Optional[SMDataset] = None,
    dataset_id: Optional[str] = None,
    sm: Optional[SMInstance] = None,
) -> SMDataset:
    """
    Return the given dataset instance, or fetch it by ID when none is given.

    Args:
        dataset: A METASPACE dataset instance. Returned unchanged when provided.
        dataset_id: The ID to look up when no dataset instance is provided.
        sm: An optional SMInstance used for the lookup, otherwise one will be created.

    Raises:
        ValueError: If neither ``dataset`` nor ``dataset_id`` is provided.
    """
    # An explicit instance always wins; the other arguments are ignored then.
    if dataset is not None:
        return dataset
    if dataset_id is None:
        raise ValueError("Either `dataset` or `dataset_id` must be provided")
    client = SMInstance() if sm is None else sm
    return client.dataset(id=dataset_id)
[docs]
def get_ion_image_shape(
    dataset: Optional[SMDataset] = None,
    dataset_id: Optional[str] = None,
    sm: Optional[SMInstance] = None,
) -> Shape2d:
    """
    Look up the ion image shape of a METASPACE dataset (in Numpy order).

    Args:
        dataset: A METASPACE dataset instance. If not provided, a dataset ID must be provided.
        dataset_id: An optional METASPACE dataset ID, if no dataset instance is provided.
        sm: An optional SMInstance, otherwise one will be created.

    Returns:
        The tuple ``(y, x)`` taken from the dataset's ``image_size``.
    """
    resolved = get_dataset(dataset=dataset, dataset_id=dataset_id, sm=sm)
    # Numpy order puts the row count (y) before the column count (x).
    size_y = resolved.image_size["y"]
    size_x = resolved.image_size["x"]
    return size_y, size_x
def _sort_isotope_images_like(
isotope_images: list[IsotopeImages], df: pd.DataFrame
) -> list[IsotopeImages]:
images_dict = {
(img.formula, img.adduct, img.chem_mod, img.neutral_loss): img for img in isotope_images
}
# Return them in the requested order.
# Note: pd.DataFrame.itertuples yields NamedTuple and is faster than iterrows.
return [
images_dict[(row.formula, row.adduct, row.chemMod, row.neutralLoss)]
for row in df.itertuples(index=False)
]
def _create_anndata_x(isotope_images: list[IsotopeImages], shape: Shape2d) -> np.ndarray:
    """
    Assemble the data matrix from the per-ion images.

    Args:
        isotope_images: One entry per ion, in the desired column order.
        shape: The shape of a single ion image.

    Returns:
        Array reshaped to (number of pixels, number of ions).
    """
    pixel_count = np.prod(shape)
    ion_count = len(isotope_images)
    # Stack the images depth-wise so that the ions form the last axis,
    # then collapse the two image axes into a single pixel axis.
    planes = [_isotope_image_to_array(image, shape=shape) for image in isotope_images]
    stacked = np.dstack(planes)
    return stacked.reshape(pixel_count, ion_count)
def _isotope_image_to_array(ion_image: IsotopeImages, shape: Shape2d) -> np.ndarray:
if len(ion_image) == 0 or ion_image[0] is None:
return np.zeros(shape=shape)
else:
return ion_image[0]
def _create_obs_with_coordinates(shape: Shape2d) -> pd.DataFrame:
    """
    Create the observations table with one row per ion image pixel.

    Args:
        shape: The shape of the ion image (in Numpy order: y, x).

    Returns:
        Dataframe with the x and y coordinate of every pixel plus the image
        shape repeated in every row, indexed by the stringified pixel number.
    """
    pixel_count = np.prod(shape)
    coords_yx = all_image_pixel_coordinates(shape)
    columns = {
        # The coordinates come in (y, x) order, hence x is the second column.
        COL.ion_image_pixel_x: coords_yx[:, 1],
        COL.ion_image_pixel_y: coords_yx[:, 0],
        COL.ion_image_shape_y: shape[0],
        COL.ion_image_shape_x: shape[1],
    }
    obs_index = pd.RangeIndex(pixel_count, name=OBS_INDEX_NAME).astype(str)
    return pd.DataFrame(columns, index=obs_index)
[docs]
def all_image_pixel_coordinates(shape: Shape2d) -> np.ndarray:
    """
    Enumerate the Y, X coordinates (Numpy order) of every pixel of an image.

    Args:
        shape: The shape of a 2D image

    Returns:
        Array of shape (number of pixels, 2), one (y, x) pair per row,
        listed row by row through the image.
    """
    # One index grid per axis: first the row (y) indices, then the column (x) indices.
    index_grids = np.indices(shape)
    # Pair up the grids per pixel, then flatten the image axes.
    paired = np.dstack(index_grids)
    return paired.reshape((-1, len(shape)))
def _create_default_spatial_uns(
image: Optional[np.ndarray] = None,
scale_factor: float = 1.0,
spot_diameter: float = 1,
library_id: str = "image",
image_key: str = "hires",
) -> dict:
return {
library_id: {
"images": {image_key: image} if image is not None else {},
"scalefactors": {
f"tissue_{image_key}_scalef": scale_factor,
"spot_diameter_fullres": spot_diameter,
},
}
}
def _create_uns_for_metaspace(dataset: SMDataset, parameters: Optional[dict] = None) -> dict:
    """
    Collect the METASPACE dataset metadata as a plain dictionary.

    Args:
        dataset: The METASPACE dataset whose metadata is collected.
        parameters: The parameters that were used to generate the AnnData object.
            ``None`` (the default) is treated like an empty dictionary.

    Returns:
        A dictionary with the dataset properties and the generation parameters.
    """
    # The signature allows omitting the parameters, so None must not break the
    # conversion below (calling ``.items()`` on None raises AttributeError).
    if parameters is None:
        parameters = {}
    # Database details contain instances of MolecularDB, which are not JSON-serializable.
    # Also, AnnData does not support lists of dicts, so we have to convert to dict of dicts.
    # See issue https://github.com/scverse/anndata/issues/708
    plain_db_details = {str(idx): db._info for idx, db in enumerate(dataset.database_details)}
    plain_projects = {str(idx): project for idx, project in enumerate(dataset.projects)}
    # Tuple values are turned into lists, all other values are kept as they are.
    plain_parameters = {k: list(v) if isinstance(v, tuple) else v for k, v in parameters.items()}
    return {
        "id": dataset.id,
        "name": dataset.name,
        "adducts": dataset.adducts,
        "databases": dataset.databases,
        "database_details": plain_db_details,
        "polarity": dataset.polarity,
        "image_size": dataset.image_size,
        "submitter": dataset.submitter,
        "group": dataset.group,
        "principal_investigator": dataset.principal_investigator,
        "projects": plain_projects,
        "status": dataset.status,
        "metadata": dict(dataset.metadata),
        "config": dataset.config,
        # Include parameters used to generate this AnnData
        f"{metaspace_to_anndata.__name__}_parameters": plain_parameters,
    }
def _create_obs_for_metaspace(
    dataset: SMDataset, obs_names: pd.Index, parameters: Optional[dict] = None
) -> pd.DataFrame:
    """
    Create an observations table that repeats the dataset metadata in every row.

    The nested metadata is flattened into dot-delimited paths below
    ``METASPACE_KEY``; every path becomes one column.

    Args:
        dataset: The METASPACE dataset providing the metadata.
        obs_names: The index of the resulting table, one entry per observation.
        parameters: The parameters that were used to generate the AnnData object.

    Returns:
        Dataframe indexed by ``obs_names`` with identical metadata in all rows.
    """
    uns_metadata = _create_uns_for_metaspace(dataset=dataset, parameters=parameters)
    # The parameters entry is excluded from descending and kept as a single value.
    parameters_path = f"{METASPACE_KEY}.{metaspace_to_anndata.__name__}_parameters"
    path_value_pairs = iter_property_paths(
        obj={METASPACE_KEY: uns_metadata},
        delimiter=".",
        dont_traverse_types=(tuple, list),
        dont_descend_paths=[parameters_path],
    )
    single_row = pd.Series(dict(path_value_pairs))
    repeated = pd.DataFrame([single_row] * len(obs_names), index=obs_names)
    # Many METASPACE metadata values are lists, but Python objects (like lists)
    # cannot be saved by AnnData. They are stringified to JSON as workaround.
    stringified = stringify_list_columns(repeated, all=True)
    return empty_columns_to_nan(stringified)
def _add_obsm_and_uns_for_squidpy(adata: AnnData, dataset: SMDataset, pixel_coords_xy: np.ndarray):
    """
    Attach the spatial information read by SquidPy to an AnnData object in place.

    Args:
        adata: The AnnData object; its ``obsm`` and ``uns`` receive an entry under
            ``SPATIAL_KEY``.
        dataset: The METASPACE dataset whose first optical image is used.
        pixel_coords_xy: Ion image pixel coordinates in (x, y) order.
    """
    optical = dataset.optical_images()
    background_yx_rgb = optical[0]
    # Maps the ion image coordinate system onto the optical image (x, y order!).
    # NOTE(review): this reads the private ``_transforms`` attribute of the optical images.
    ion_to_optical_xy = optical._transforms[0]
    # SquidPy takes the (x, y) coordinates from obsm ...
    adata.obsm[SPATIAL_KEY] = transform_coords(pixel_coords_xy, ion_to_optical_xy)
    # ... and the remaining spatial information from uns.
    adata.uns[SPATIAL_KEY] = _create_default_spatial_uns(image=background_yx_rgb, spot_diameter=0.5)
def _dict_without_keys(dictionary: dict, keys: Container) -> dict:
# Exclude keys from a dictionary
return {k: v for k, v in dictionary.items() if k not in keys}