Multisample analysis

Setup

import missionbio.mosaic as ms
from IPython.display import display, HTML
# Load a multi-sample file.
# By default, a multi-sample h5 file loads as a SampleGroup object.

group = ms.load_example_dataset("Multisample PBMC")
# To analyze one sample at a time, access it through the `samples` attribute.
# Index 1 selects the second sample in the group.
sample = group.samples[1]

# List the names of the samples in the h5 file.
[s.name for s in group.samples]
['Sample 1', 'Sample 2']

Applying functions

# To apply a function across all samples use the `apply` method on the SampleGroup
# It returns a list of returned objects for each sample

def filt(sample):
    """Return the filtered variants of one sample.

    Calls ``filter_variants()`` with its default arguments on the sample's
    ``dna`` assay and hands the result back unchanged.
    """
    return sample.dna.filter_variants()

# `filtered_variants` holds one result per sample (here, an array of variant ids each).
filtered_variants = group.apply(filt)
filtered_variants
[array(['chr2:198266943:C/T', 'chr2:198267770:G/GAA', 'chr4:55599436:T/C',
        'chr5:170837457:A/G', 'chr7:140449071:C/G', 'chr7:148504716:AG/A',
        'chr7:148504854:A/AGACTT', 'chr7:148508833:A/G',
        'chr7:148543525:A/G', 'chr7:148543583:G/C', 'chr7:148543693:TA/T',
        'chr11:32417945:T/C', 'chr11:119148573:G/T', 'chr13:28602292:T/C',
        'chr17:7578115:T/C', 'chr17:7579801:G/C', 'chr17:29559932:C/A',
        'chr20:31024028:G/A', 'chrX:39933339:A/G', 'chrX:44833841:C/A',
        'chrX:133547814:T/C'], dtype='<U23'),
 array(['chr2:198266943:C/T', 'chr2:198267770:G/GAA', 'chr4:55599436:T/C',
        'chr5:170837457:A/G', 'chr7:140449071:C/G', 'chr7:148504716:AG/A',
        'chr7:148504854:A/AGACTT', 'chr7:148508833:A/G',
        'chr7:148543525:A/G', 'chr7:148543583:G/C', 'chr7:148543693:TA/T',
        'chr11:32417945:T/C', 'chr11:119148573:G/T', 'chr13:28602292:T/C',
        'chr17:7578115:T/C', 'chr17:7579801:G/C', 'chr17:29559932:C/A',
        'chr20:31024028:G/A', 'chrX:39933339:A/G', 'chrX:44833841:C/A',
        'chrX:133547814:T/C'], dtype='<U23')]
# Subset the same variants in all dna assays
# It is important to maintain the same variants across all dna assays

og_num_vars = [s.dna.shape[1] for s in group.samples]

# Union of the filtered variants over all samples.
# dict.fromkeys de-duplicates while keeping first-seen order, so the variant
# order is the same on every run. Iterating a set of strings can give a
# different order in each interpreter session (string hash randomization).
var_union = list(dict.fromkeys(v for variants in filtered_variants for v in variants))
for sample in group:
    sample.dna = sample.dna[:, var_union]  # Subsets all samples with the same variants

new_num_vars = [s.dna.shape[1] for s in group.samples]

print(og_num_vars, new_num_vars)  # The old and new number of variants for each sample in the group
[21, 21] [21, 21]
# The functions applied on each sample can be more complex - like this assignment and relabeling method
# Note the original labels can be uncoordinated across samples in the group
# The labels are changed to ensure that each label is for the same clone

variants_of_interest = ['chr7:148508833:A/G', 'chr17:29559932:C/A', 'chr4:55599436:T/C']


def cluster(sample):
    """Cluster one sample by genotype and give its clones cross-sample names.

    Cells are grouped on ``variants_of_interest``. Each resulting label is
    then replaced by its NGT signature over those variants, joined with "-"
    (e.g. "0-1-1"), so that the same genotype carries the same label in
    every sample. The outlier categories "missing", "small" and "ADO" keep
    their names.

    Args:
        sample: Sample whose ``dna`` assay is clustered. Its labels are
            renamed through ``sample.dna.rename_labels``.

    Returns:
        The clone table returned by ``group_by_genotype``, with its index
        renamed through the same label map.
    """
    clone_table = sample.dna.group_by_genotype(variants_of_interest, max_ado_score=0.8)

    # Rename labels so that each sample has the same labels.
    # The signature of each label over the variants of interest is the new name.
    signature = sample.dna.signature("NGT").loc[:, variants_of_interest]
    names = signature.apply(lambda vs: "-".join(str(int(v)) for v in vs), axis=1)
    label_map = dict(names.items())

    # Don't rename the outlier categories. A sample need not contain every
    # category, so drop them with pop() rather than `del`, which would raise
    # KeyError for an absent one.
    for lab in ("missing", "small", "ADO"):
        label_map.pop(lab, None)

    sample.dna.rename_labels(label_map)
    # NOTE(review): the rendered tables appear to list clones as column
    # headers - confirm that renaming the index (not the columns) is intended.
    clone_table = clone_table.rename(index=label_map)

    return clone_table  # Return the clone table

# Run the clustering on every sample; `tables` holds one clone table per sample.
tables = group.apply(cluster)

for t in tables:  # The clone tables for each sample
    display(HTML(t.to_html()))  # Render the table as HTML in the notebook output
clone 1 4 6 Missing GT clones (4) Small subclones (12) ADO clones (4)
chr7:148508833:A/G Het (50.2%) WT (2.14%) Het (35.02%) Missing in 0.00% of cells WT (32.3%) Het (47.42%)
chr17:29559932:C/A Het (49.37%) Het (51.98%) Het (49.73%) Missing in 0.10% of cells Het (49.89%) Het (50.23%)
chr4:55599436:T/C Hom (99.81%) Het (53.44%) Het (73.29%) Missing in 0.67% of cells Hom (70.91%) Hom (99.71%)
Total Cell Number 1377 (70.87%) 87 (4.48%) 79 (4.07%) 15 (0.77%) 56 (2.88%) 329 (16.93%)
Sample 1 Cell Number 1377 (70.87%) 87 (4.48%) 79 (4.07%) 15 (0.77%) 56 (2.88%) 329 (16.93%)
Parents [6] NaN NaN NaN NaN NaN
Sisters [small] NaN NaN NaN NaN NaN
ADO score 0.0 0 0 NaN NaN NaN
clone 1 4 6 Missing GT clones (4) Small subclones (7) ADO clones (4)
chr7:148508833:A/G WT (1.03%) Het (29.15%) Het (49.14%) Missing in 0.00% of cells WT (20.71%) WT (0.95%)
chr17:29559932:C/A Het (49.45%) Het (49.2%) Het (49.25%) Missing in 0.06% of cells Het (47.46%) Het (50.09%)
chr4:55599436:T/C Het (50.56%) Het (70.28%) Hom (99.42%) Missing in 0.94% of cells Hom (67.46%) Het (47.08%)
Total Cell Number 1189 (65.76%) 79 (4.37%) 74 (4.09%) 18 (1.0%) 66 (3.65%) 382 (21.13%)
Sample 2 Cell Number 1189 (65.76%) 79 (4.37%) 74 (4.09%) 18 (1.0%) 66 (3.65%) 382 (21.13%)
Parents NaN NaN NaN NaN NaN NaN
Sisters NaN NaN NaN NaN NaN NaN
ADO score 0 0 0 NaN NaN NaN

Drawing figures

# Displaying the same plot across all the samples

for sample in group:
    # Heatmap of the NGT_FILTERED layer, restricted to the variants of interest
    fig = sample.dna.heatmap("NGT_FILTERED", features=variants_of_interest)
    fig.show("jpg")  # Render the figure as a jpg image
../_images/1cde6de1cf93917e6cc848e7b660a9ad4c608f4ac4d1be97fd75e267d557288b.jpg ../_images/d330bf0e3eb9876aafe977694f091bd4f4629b80b1ecf1627a58306c63ba785a.jpg
# Normalize Protein and look for patterns

for sample in group:
    # The return value is not used; presumably this normalizes the protein assay in place - confirm
    sample.protein.normalize_reads()
    # One heatmap per sample covering both the dna and the protein assay
    fig = sample.heatmap(("dna", "protein"))
    fig.show("jpg")
../_images/2657872157380a5d06a99e7cac8bd9436dfa52a1278c6817fd5c07444b804e01.jpg ../_images/fcc11501aab337eeda0754599ab79c6d45cb7ff40f9ee518468453524764a698.jpg
# Ignore warnings raised when running clone_vs_analyte

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  # Only FutureWarning is silenced
# Same three variants as used for the clustering, listed in a different order
variants_of_interest = ['chr17:29559932:C/A', 'chr4:55599436:T/C', 'chr7:148508833:A/G']
proteins_of_interest = ['CD19', "CD34", "CD30"]
clones_of_interest = ["0-1-1", "1-1-2"]

for sample in group:
    # Subset the sample to the barcodes returned for the clones of interest
    s = sample[sample.dna.barcodes(clones_of_interest)]
    # Keep only the variants and proteins of interest in the subset
    s.dna = s.dna[:, variants_of_interest]
    s.protein = s.protein[:, proteins_of_interest]
    s.clone_vs_analyte("protein")
../_images/49ebb4d13e63f52a553fd36e9471c2c4c45cfca3f3ef25ac1c58553a5ffdaf49.png ../_images/e57809fd728700846c641d45c78584bbf94f470b93c6f873a6680407008e550e.png

Multisample plots

# Draw a fishplot for the dna labels
# From the proportions in the heatmaps - The two clones of interest are 0-1-1 and 1-1-2

fig = group.fishplot(
    "dna",
    sample_order=["Sample 1", "Sample 2"],
    labels=["0-1-1", "1-1-2"],
    parents=[None, None]  # One entry per label; presumably None means the clone has no parent - confirm
)
fig.show("jpg")
../_images/ef9c1065a0531c4d1964a01946cc9ce3a798e498739503a46a7cf0eaa4a85973.jpg
# Draw a barplot for the dna labels

fig = group.barplot(
    "dna",
    sample_order=["Sample 1", "Sample 2"],
    label_order=["0-1-1", "1-1-2"],  # The same two clones as in the fishplot
    percentage=True
)
fig.show("jpg")
../_images/08b687bb987ef5ab80e3850d79b1393304ddc73756b3ad39fa5799b784f59c04.jpg