Simulate CITE-seq data

Introduction

In this example, we show how to use scDesign3Py to simulate CITE-seq data and illustrate the similarity between the inputted reference data and synthetic data.

Import packages and Read in data

import pacakges

import anndata as ad
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import umap
import scDesign3Py

Read in the reference data

data = ad.read_h5ad("data/CITE.h5ad")

To save computational time, we only use the top 100 genes and six more genes with protein and RNA abundance information.

keep_gene = ["CD4", "CD14", "CD19", "CD34", "CD3E", "CD8A"]
keep_adt = ["ADT_CD4", "ADT_CD14", "ADT_CD19", "ADT_CD34", "ADT_CD3", "ADT_CD8"]
keep = keep_gene + keep_adt
idx = keep + data.var_names.tolist()[:100]
data = data[:, idx]
data.layers["log"] = np.log1p(data.X)
data
AnnData object with n_obs × n_vars = 8617 × 112
    obs: 'nCount_RNA', 'nFeature_RNA', 'RNA_snn_res.0.8', 'seurat_clusters', 'nCount_ADT', 'ident', 'cell_type'
    var: 'name'
    layers: 'log'

Simulation

We input the reference data and use the one-shot function to simulate CITE-seq dat using discrete cell types as the covariates for fitting each gene’s marginal distribution.

test = scDesign3Py.scDesign3(n_cores=3,parallelization="pbmcmapply")
test.set_r_random_seed(123)
simu_res = test.scdesign3(
    anndata=data,
    default_assay_name="counts",
    celltype="cell_type",
    mu_formula="cell_type",
    sigma_formula="cell_type",
    family_use="nb",
    usebam=False,
    corr_formula="cell_type",
    copula="vine",
    nonnegative=True,
    nonzerovar=True,
)

After the simulation, we can create the AnnData object using the synthetic count matrix and store the logcounts to the input and synthetic AnnData objects.

simu_data = ad.AnnData(
    X=simu_res["new_count"], obs=simu_res["new_covariate"], layers={"log": np.log1p(simu_res["new_count"])}
)

Then, we obtained the PCA and UMAP for both the inputted reference data and the synthetic data. These sets of embedding will be used for the visualization below.

reducer_umap = umap.UMAP(n_neighbors=15, min_dist=0.1)
reducer_pca = PCA(n_components=50, whiten=False)
reducer_pca.fit(data.layers["log"].toarray())
org_pca = reducer_pca.transform(data.layers["log"].toarray())
simu_pca = reducer_pca.transform(simu_data.layers["log"])
reducer_umap.fit(org_pca)
org_umap = reducer_umap.transform(org_pca)
simu_umap = reducer_umap.transform(simu_pca)

Visualization

To visualize the results, we select six genes and reformat their UMAP embedding we got in the previous step.

scaler = MinMaxScaler()
org_data = pd.DataFrame(
    np.concatenate([org_umap, scaler.fit_transform(data.layers["log"].toarray()[:, 0:12])], axis=1),
    columns=["UMAP1", "UMAP2"]
    + ["CD4", "CD14", "CD19", "CD34", "CD3", "CD8"]
    + ["ADT_CD4", "ADT_CD14", "ADT_CD19", "ADT_CD34", "ADT_CD3", "ADT_CD8"],
)
simu_data = pd.DataFrame(
    np.concatenate([simu_umap, scaler.fit_transform(simu_data.layers["log"][:, 0:12])], axis=1),
    columns=["UMAP1", "UMAP2"]
    + ["CD4", "CD14", "CD19", "CD34", "CD3", "CD8"]
    + ["ADT_CD4", "ADT_CD14", "ADT_CD19", "ADT_CD34", "ADT_CD3", "ADT_CD8"],
)

Six genes’ protein and RNA abundances are shown on the cell UMAP embeddings in the inputted reference data and the synthetic data below.

import matplotlib.pyplot as plt
import matplotlib as mpl
Hide code cell source
fig, axes = plt.subplots(2, 8, figsize=(8 * 3, 2 * 3), sharey=True, sharex=True)
colors = plt.get_cmap("viridis")
norm = plt.Normalize(vmax=1, vmin=0)

for i, name in enumerate(
    ["CD3_Protein", "CD3_RNA", "CD8_Protein", "CD8_RNA", "CD14_Protein", "CD14_RNA", "CD19_Protein", "CD19_RNA"]
):
    for j, assay in enumerate(["True Data", "scDesign3"]):
        ax = axes[j][i]
        ax.set_xticks([])
        ax.set_yticks([])
        type = name.split("_")[1]
        gene = name.split("_")[0]
        if type == "Protein":
            gene = "ADT_" + gene
        if assay == "True Data":
            dat = org_data[["UMAP1", "UMAP2", gene]]
            ax.set_title(type)
        else:
            dat = simu_data[["UMAP1", "UMAP2", gene]]

        ax.scatter(dat["UMAP1"], dat["UMAP2"], c=dat[gene], alpha=0.5)

fig.tight_layout()
fig.text(0.5, -0.05, "UMAP1", ha="center", fontsize=15)
fig.text(-0.008, 0.5, "UMAP2", va="center", rotation="vertical", fontsize=15)
fig.text(1, 0.65, "True Data", fontsize=15, rotation=270)
fig.text(1, 0.15, "scDesign3", fontsize=15, rotation=270)

fig.text(0.12, 1.003, "CD3", fontsize=15)
fig.text(0.365, 1.003, "CD8", fontsize=15)
fig.text(0.61, 1.003, "CD14", fontsize=15)
fig.text(0.86, 1.003, "CD19", fontsize=15)

position = fig.add_axes([0.2, -0.1, 0.60, 0.025])
fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=colors), cax=position, orientation="horizontal")
fig.text(0.13, -0.1, "Expression", fontsize=15)

plt.show()
../../_images/18e842fbd6249883621455f80bd91c9b1891ea68704cd63cee19a461e9ee79de.png