Last active
July 16, 2025 02:25
-
-
Save ericmjl/e5b267782f9cbd27f712153deab426e1 to your computer and use it in GitHub Desktop.
Unified Laboratory Data Storage for Biological Entities using xarray - demonstrating coordinate-aligned experimental workflows
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# requires-python = ">=3.13" | |
# dependencies = [ | |
# "anthropic==0.57.1", | |
# "loguru==0.7.3", | |
# "marimo", | |
# "matplotlib==3.10.3", | |
# "numpy==2.3.1", | |
# "pandas==2.3.1", | |
# "scikit-learn==1.7.0", | |
# "scipy==1.16.0", | |
# "seaborn==0.13.2", | |
# "xarray==2025.7.1", | |
# "zarr==3.0.10", | |
# ] | |
# /// | |
import marimo | |
__generated_with = "0.14.10" | |
app = marimo.App(width="columns") | |
@app.cell(hide_code=True) | |
def _(): | |
import marimo as mo | |
return (mo,) | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md( | |
""" | |
# Unified laboratory data storage for microRNA experiments | |
What if I told you that your entire microRNA experimental lifecycle could live in a single, queryable data structure? From the moment you extract RNA and measure expression levels to the final machine learning predictions, everything coordinate-aligned and ready for analysis. | |
I've been thinking about this problem for years. We generate expression data, then sequence features, then model outputs, then train/test splits. Each piece typically lives in its own file, its own format, with its own indexing scheme. The cognitive overhead of keeping track of which microRNA corresponds to which row in which CSV is exhausting. | |
Here's what I've found works: store everything in a unified xarray Dataset where microRNA identifiers are the shared coordinate system. Your expression measurements, computed features, statistical estimates, and data splits all aligned by the same microRNA IDs. No more integer indices. No more file juggling. Just clean, coordinated data that scales to the cloud. | |
""" | |
) | |
return | |
@app.cell(hide_code=True) | |
def _(): | |
import numpy as np | |
import pandas as pd | |
import xarray as xr | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from datetime import datetime, timedelta | |
from sklearn.preprocessing import OneHotEncoder | |
from sklearn.model_selection import train_test_split | |
import warnings | |
warnings.filterwarnings("ignore") | |
np.random.seed(42) | |
return datetime, np, plt, sns, timedelta, train_test_split, xr | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md( | |
""" | |
I'm going to walk you through building this system step by step. We'll start simple and add complexity progressively, just like a real experiment unfolds. | |
First, let's generate some microRNAs to work with. | |
""" | |
) | |
return | |
@app.cell | |
def _(): | |
# Generate synthetic microRNA identifiers | |
n_micrornas = 150 | |
mirna_ids = [ | |
f"hsa-miR-{seq_i}" for seq_i in range(1, n_micrornas + 1) | |
] | |
return mirna_ids, n_micrornas | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""Now we need to model our experimental design factors. In any good microRNA experiment, you're thinking about cell lines, treatments, replicates, time points, and experimental conditions.""") | |
return | |
@app.cell | |
def _(): | |
# Experimental design factors | |
treatments = ["control", "hypoxia", "inflammation"] | |
replicates = [f"rep_{rep_i}" for rep_i in range(1, 4)] | |
cell_lines = [f"cell_line_{cell_i:02d}" for cell_i in range(1, 15)] | |
time_points = ["2h", "6h", "12h", "24h", "48h"] | |
return cell_lines, replicates, time_points, treatments | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""And of course, experiments happen over time. Let's simulate a multi-week microRNA expression study.""") | |
return | |
@app.cell | |
def _(datetime, timedelta): | |
# Generate experimental dates | |
start_date = datetime(2024, 1, 1) | |
experiment_dates = [ | |
start_date + timedelta(days=date_i * 7) for date_i in range(8) | |
] | |
return (experiment_dates,) | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""Now comes the fun part - generating some fake expression data and organizing it into our coordinate system. This is where xarray really shines. Instead of managing separate arrays and keeping track of which index corresponds to which condition, we create one unified structure where every measurement knows exactly which microRNA, treatment, and time point it belongs to.""") | |
return | |
@app.cell | |
def _( | |
cell_lines, | |
experiment_dates, | |
mirna_ids, | |
replicates, | |
time_points, | |
treatments, | |
): | |
# Create coordinate system for our experimental data | |
coords = { | |
"mirna": mirna_ids, | |
"treatment": treatments, | |
"replicate": replicates, | |
"time_point": time_points, | |
"cell_line": cell_lines[:10], # Use subset for demonstration | |
"experiment_date": experiment_dates[:4], # Use subset | |
} | |
return (coords,) | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""Time to generate some synthetic expression measurements. I'm adding treatment effects so there's actually signal in our data.""") | |
return | |
@app.cell | |
def _(coords, np): | |
# Generate synthetic expression data with realistic structure | |
shape = tuple(len(coords[dim]) for dim in coords.keys()) | |
base_expression = np.random.lognormal(2, 1, shape) # Log-normal for expression data | |
# Add treatment effects - this is what we'll try to recover later | |
treatment_effects = {"control": 1.0, "hypoxia": 2.5, "inflammation": 0.4} | |
treatments_list = coords["treatment"] | |
for treatment_idx, treatment in enumerate(treatments_list): | |
base_expression[:, treatment_idx, :, :, :, :] *= treatment_effects[treatment] | |
# Add microRNA-specific effects so some microRNAs are inherently more expressed | |
mirna_effects = np.random.lognormal(0, 0.5, len(coords["mirna"])) | |
for mirna_idx in range(len(coords["mirna"])): | |
base_expression[mirna_idx, :, :, :, :, :] *= mirna_effects[mirna_idx] | |
return (base_expression,) | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""Now we wrap this into our unified xarray Dataset. This becomes the foundation that everything else builds on.""") | |
return | |
@app.cell | |
def _(base_expression, coords, experiment_dates, mirna_ids, xr): | |
# Create the main laboratory data array | |
lab_data = xr.DataArray( | |
base_expression, | |
coords=coords, | |
dims=list(coords.keys()), | |
name="expression_level", | |
attrs={ | |
"units": "relative_expression_units", | |
"description": "MicroRNA expression levels across experimental conditions", | |
"measurement_protocol": "qRT-PCR_v3.2", | |
"created_date": str(experiment_dates[0]), | |
}, | |
) | |
# Start the unified experiment dataset | |
unified_stage1 = xr.Dataset( | |
data_vars={"expression_level": lab_data}, | |
coords=coords, | |
attrs={ | |
"title": "Unified MicroRNA Experiment Dataset", | |
"description": "Progressive experimental data accumulation", | |
"experiment_type": "mirna_expression_screen", | |
"workflow_stage": "1_laboratory_data", | |
"created_date": str(experiment_dates[0]), | |
"n_micrornas": len(mirna_ids), | |
"storage_format": "xarray_zarr", | |
}, | |
) | |
return lab_data, unified_stage1 | |
@app.cell(hide_code=True) | |
def _(unified_stage1): | |
unified_stage1 | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""Now let's calculate effect estimates for each experimental factor. This demonstrates the key concept: storing statistical results aligned with our experimental coordinates. We'll calculate mean expression levels for each microRNA, treatment, time point, replicate, and cell line - each stored along its corresponding coordinate dimension.""") | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""Let's prepare our data for analysis using xarray's native flattening capabilities. We'll use the built-in `.to_dataframe()` method to convert our multidimensional data into tabular format for calculating effect estimates.""") | |
return | |
@app.cell | |
def _(lab_data): | |
# Use xarray's native flattening - much cleaner than nested loops | |
# This creates a DataFrame with MultiIndex from all coordinates | |
model_data = lab_data.to_dataframe() | |
# Reset index to make coordinates into columns | |
modeling_df = model_data.reset_index() | |
modeling_df | |
return (modeling_df,) | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""Perfect! Now we have our data in the right format for analysis. Each row represents one observation with all the experimental factors as columns. Time to calculate effect estimates for each experimental factor.""") | |
return | |
@app.cell | |
def _(modeling_df): | |
# Calculate simple effect estimates for each experimental factor | |
# This demonstrates coordinate alignment without complex modeling | |
# Calculate mean expression for each level of each factor | |
# MicroRNA effects - mean expression per microRNA | |
mirna_effect_estimates = modeling_df.groupby('mirna')['expression_level'].mean().values | |
mirna_effect_estimates_std = modeling_df.groupby('mirna')['expression_level'].std().values | |
# Treatment effects - mean expression per treatment | |
treatment_effect_estimates = modeling_df.groupby('treatment')['expression_level'].mean().values | |
treatment_effect_estimates_std = modeling_df.groupby('treatment')['expression_level'].std().values | |
# Time point effects - mean expression per time point | |
time_effect_estimates = modeling_df.groupby('time_point')['expression_level'].mean().values | |
time_effect_estimates_std = modeling_df.groupby('time_point')['expression_level'].std().values | |
# Replicate effects - mean expression per replicate | |
replicate_effect_estimates = modeling_df.groupby('replicate')['expression_level'].mean().values | |
replicate_effect_estimates_std = modeling_df.groupby('replicate')['expression_level'].std().values | |
# Cell line effects - mean expression per cell line | |
cell_line_effect_estimates = modeling_df.groupby('cell_line')['expression_level'].mean().values | |
cell_line_effect_estimates_std = modeling_df.groupby('cell_line')['expression_level'].std().values | |
return ( | |
cell_line_effect_estimates, | |
cell_line_effect_estimates_std, | |
mirna_effect_estimates, | |
mirna_effect_estimates_std, | |
replicate_effect_estimates, | |
replicate_effect_estimates_std, | |
time_effect_estimates, | |
time_effect_estimates_std, | |
treatment_effect_estimates, | |
treatment_effect_estimates_std, | |
) | |
@app.cell | |
def _(mo): | |
mo.md(r"""Now we're going to add estimates into the xarray dataset.""") | |
return | |
@app.cell | |
def _( | |
cell_line_effect_estimates, | |
cell_line_effect_estimates_std, | |
mirna_effect_estimates, | |
mirna_effect_estimates_std, | |
replicate_effect_estimates, | |
replicate_effect_estimates_std, | |
time_effect_estimates, | |
time_effect_estimates_std, | |
treatment_effect_estimates, | |
treatment_effect_estimates_std, | |
unified_stage1, | |
): | |
# Add effect estimates to unified dataset, properly aligned with coordinates | |
unified_stage2 = unified_stage1.assign({ | |
'mirna_effects': (['mirna'], mirna_effect_estimates), | |
'mirna_effects_std': (['mirna'], mirna_effect_estimates_std), | |
'treatment_effects': (['treatment'], treatment_effect_estimates), | |
'treatment_effects_std': (['treatment'], treatment_effect_estimates_std), | |
'time_effects': (['time_point'], time_effect_estimates), | |
'time_effects_std': (['time_point'], time_effect_estimates_std), | |
'replicate_effects': (['replicate'], replicate_effect_estimates), | |
'replicate_effects_std': (['replicate'], replicate_effect_estimates_std), | |
'cell_line_effects': (['cell_line'], cell_line_effect_estimates), | |
'cell_line_effects_std': (['cell_line'], cell_line_effect_estimates_std) | |
}) | |
# Update metadata | |
unified_stage2.attrs.update({ | |
"workflow_stage": "2_effect_estimation", | |
"analysis_type": "factor_effects_by_coordinate", | |
"effects_description": "Mean expression effects for each experimental factor, aligned with coordinates" | |
}) | |
return (unified_stage2,) | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md(r"""Now let's take a look at what the XArray dataset looks like:""") | |
return | |
@app.cell(hide_code=True) | |
def _(unified_stage2): | |
unified_stage2 | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md( | |
""" | |
Perfect! Now we see the addition of effect estimates for each experimental factor. Notice how we have `mirna_effects`, `treatment_effects`, `time_effects`, `replicate_effects`, and `cell_line_effects` - each aligned with their respective coordinate dimensions. Each effect estimate also has its uncertainty stored as `_effects_std`. | |
This is the beauty of the coordinate-based approach: our effect estimates align perfectly with our experimental design coordinates. Each experimental factor gets its own effect estimate with uncertainty, organized by the same coordinate system as our raw data. The treatment effects are indexed by the treatment coordinate, microRNA effects by the mirna coordinate, and so on. | |
The analysis demonstrates how we can take our coordinate-aligned experimental data, calculate statistical summaries, and store those results back into the same unified dataset with proper coordinate alignment. Everything stays connected by our coordinate system. | |
""" | |
) | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md( | |
""" | |
## Machine learning features | |
Next up: machine learning features. I want to show you how features can live alongside your experimental data with perfect coordinate alignment. No more wondering which row corresponds to which microRNA. | |
""" | |
) | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""Let's generate some synthetic features to demonstrate the concept. I'm not actually calculating real microRNA features here - that would require proper bioinformatics libraries and sequence analysis. Instead, I'm just generating what typical ML features might look like: nucleotide composition, thermodynamic properties, and target predictions.""") | |
return | |
@app.cell | |
def _(mirna_ids, np): | |
# Generate synthetic nucleotide composition features | |
nucleotides = ["A", "U", "G", "C"] | |
feature_data = {} | |
# Fake nucleotide composition (normalized to sum to 1) | |
nt_composition = np.random.rand(len(mirna_ids), len(nucleotides)) | |
nt_composition = nt_composition / nt_composition.sum(axis=1, keepdims=True) | |
for nt_idx, nt in enumerate(nucleotides): | |
feature_data[f"nt_{nt}"] = nt_composition[:, nt_idx] | |
return (feature_data,) | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""Now let's add some fake thermodynamic properties. In reality, these would be calculated from the actual microRNA sequences using libraries like ViennaRNA or RNAfold.""") | |
return | |
@app.cell | |
def _(feature_data, mirna_ids, np): | |
# Generate fake thermodynamic properties | |
feature_data["length"] = np.random.randint(18, 25, len(mirna_ids)) | |
feature_data["gc_content"] = np.random.uniform(0.3, 0.7, len(mirna_ids)) | |
feature_data["mfe_structure"] = np.random.normal(-20, 5, len(mirna_ids)) # Minimum free energy | |
feature_data["seed_stability"] = np.random.normal(-8, 2, len(mirna_ids)) | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""And finally, some synthetic target predictions. These would typically come from tools like TargetScan, miRanda, or PITA.""") | |
return | |
@app.cell | |
def _(feature_data, mirna_ids, np): | |
# Generate fake target prediction features | |
target_classes = ["oncogene", "tumor_suppressor", "metabolic"] | |
target_probs = np.random.dirichlet([1, 1, 1], len(mirna_ids)) | |
for target_idx, target in enumerate(target_classes): | |
feature_data[f"target_{target}"] = target_probs[:, target_idx] | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""Now comes the key part - adding these features to our unified dataset using the categorical coordinate approach. This keeps everything aligned with our microRNA identifiers.""") | |
return | |
@app.cell | |
def _(feature_data, mirna_ids, np, unified_stage2, xr): | |
# Add ML features to unified dataset using categorical coordinate | |
feature_names = list(feature_data.keys()) | |
feature_matrix = np.array([feature_data[name] for name in feature_names]).T | |
unified_stage3 = unified_stage2.assign( | |
{"ml_features": (["mirna", "feature"], feature_matrix)} | |
).assign_coords(feature=feature_names) | |
unified_stage3.attrs.update({ | |
"workflow_stage": "3_feature_engineering", | |
"n_ml_features": len(feature_data), | |
"feature_types": "nucleotide_composition, thermodynamic_properties, target_predictions", | |
"feature_encoding": "one_hot_and_continuous", | |
}) | |
# Create separate ml_features for backward compatibility | |
ml_features = xr.Dataset( | |
data_vars={name: (["mirna"], values) for name, values in feature_data.items()}, | |
coords={"mirna": mirna_ids}, | |
) | |
return ml_features, unified_stage3 | |
@app.cell(hide_code=True) | |
def _(unified_stage3): | |
unified_stage3 | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md( | |
""" | |
Take a moment to click around and explore this xarray Dataset. Notice how we've got everything joined together: our `expression_level` data lives right next to `model_coefficients` which live right next to `ml_features`. Everything is coordinate-aligned by microRNA identifier. | |
This is the magic - you can slice across any dimension and everything stays connected. Want features for specific microRNAs? The model results for those same microRNAs come along automatically. This unified structure is going to be extremely helpful when we start building complex analysis pipelines. | |
""" | |
) | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md( | |
""" | |
Here's how simple it becomes to extract data across our unified structure. Notice the clean xarray syntax - we can slice features, select specific nucleotides, or grab target predictions with just coordinate indexing: | |
```python | |
# Get nucleotide features for first 10 microRNAs | |
nt_features = unified_stage3.ml_features.sel( | |
feature=[f for f in unified_stage3.feature.values if f.startswith("nt_")] | |
).isel(mirna=slice(0, 10)) | |
# Get target predictions | |
target_features = unified_stage3.ml_features.sel( | |
feature=[f for f in unified_stage3.feature.values if f.startswith("target_")] | |
) | |
``` | |
Everything stays coordinate-aligned automatically. No manual bookkeeping required. | |
Below you'll see some plots that demonstrate this convenience. They were made using exactly this selection syntax - clean coordinate-based indexing rather than having to do weird joins across multiple DataFrames. The unified structure makes data extraction for visualization incredibly straightforward. | |
""" | |
) | |
return | |
@app.cell(hide_code=True) | |
def _(ml_features, plt, sns, unified_stage3): | |
# Visualize ML features using the new coordinate-based structure | |
features_fig, features_axes = plt.subplots(2, 2, figsize=(12, 8)) | |
# Nucleotide composition heatmap (using unified dataset) | |
nt_feature_mask = [f.startswith("nt_") for f in unified_stage3.feature.values] | |
nt_features = unified_stage3.feature.values[nt_feature_mask] | |
nt_data = unified_stage3.ml_features.sel(feature=nt_features).values[ | |
:30, : | |
] # First 30 microRNAs | |
sns.heatmap( | |
nt_data, | |
xticklabels=[f.split("_")[1] for f in nt_features], | |
yticklabels=unified_stage3.mirna.values[:30], | |
ax=features_axes[0, 0], | |
cmap="viridis", | |
) | |
features_axes[0, 0].set_title("Nucleotide Composition (First 30 microRNAs)") | |
# MicroRNA properties (using backward compatibility dataset) | |
ml_features.length.plot(ax=features_axes[0, 1]) | |
features_axes[0, 1].set_title("MicroRNA Length Distribution") | |
# GC content vs MFE structure | |
features_axes[1, 0].scatter( | |
ml_features.gc_content, ml_features.mfe_structure, alpha=0.6 | |
) | |
features_axes[1, 0].set_xlabel("GC Content") | |
features_axes[1, 0].set_ylabel("MFE Structure") | |
features_axes[1, 0].set_title("Thermodynamic Properties") | |
# Target predictions (using unified dataset) | |
target_feature_mask = [ | |
f.startswith("target_") for f in unified_stage3.feature.values | |
] | |
target_features = unified_stage3.feature.values[target_feature_mask] | |
target_data = unified_stage3.ml_features.sel(feature=target_features).values | |
target_means = target_data.mean(axis=0) | |
features_axes[1, 1].bar(range(len(target_features)), target_means) | |
features_axes[1, 1].set_xticks(range(len(target_features))) | |
features_axes[1, 1].set_xticklabels([f.split("_")[1] for f in target_features]) | |
features_axes[1, 1].set_title("Average Target Class Predictions") | |
plt.tight_layout() | |
plt.show() | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""Finally, let's add train/test splits. The beauty here is that splits are stored as boolean masks aligned with our microRNA coordinate. No more integer indices that break when you reorder your data.""") | |
return | |
@app.cell | |
def _(): | |
# Define our train/test split strategies | |
# Note: temporal_split is just a nod to cheminformaticians - we're not actually using temporal data here | |
split_configs = { | |
"random_80_20": {"test_size": 0.2, "random_state": 42}, | |
"random_70_30": {"test_size": 0.3, "random_state": 123}, | |
"temporal_split": {"test_size": 0.25, "random_state": 456}, | |
} | |
return (split_configs,) | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""Now let's generate the actual splits and create the boolean masks that will become part of our unified dataset.""") | |
return | |
@app.cell | |
def _(mirna_ids, n_micrornas, np, split_configs, train_test_split): | |
# Generate the splits and create boolean masks | |
splits_data = {} | |
split_types = list(split_configs.keys()) | |
# Create matrices to hold all train/test masks | |
n_splits = len(split_types) | |
train_masks = np.zeros((n_micrornas, n_splits), dtype=bool) | |
test_masks = np.zeros((n_micrornas, n_splits), dtype=bool) | |
for split_idx, (config_split_name, config) in enumerate(split_configs.items()): | |
train_mirnas, test_mirnas = train_test_split( | |
mirna_ids, | |
test_size=config["test_size"], | |
random_state=config["random_state"], | |
) | |
# Create boolean masks for easy indexing | |
train_mask = np.isin(mirna_ids, train_mirnas) | |
test_mask = np.isin(mirna_ids, test_mirnas) | |
# Store in matrices | |
train_masks[:, split_idx] = train_mask | |
test_masks[:, split_idx] = test_mask | |
splits_data[config_split_name] = { | |
"train_mirnas": train_mirnas, | |
"test_mirnas": test_mirnas, | |
"train_mask": train_mask, | |
"test_mask": test_mask, | |
} | |
return split_types, splits_data, test_masks, train_masks | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md("""Finally, let's add these splits to our unified dataset to complete the experimental workflow.""") | |
return | |
@app.cell | |
def _(split_configs, split_types, test_masks, train_masks, unified_stage3): | |
# Add train/test splits using the new split_type dimension | |
unified_final = unified_stage3.assign({ | |
'train_mask': (['mirna', 'split_type'], train_masks), | |
'test_mask': (['mirna', 'split_type'], test_masks) | |
}).assign_coords(split_type=split_types) | |
unified_final.attrs.update({ | |
"workflow_stage": "4_data_splits_complete", | |
"n_split_strategies": len(split_configs), | |
"split_configs": str(split_configs), | |
"experiment_complete": True, | |
}) | |
return (unified_final,) | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md(r"""Let's view the final unified dataset below.""") | |
return | |
@app.cell(hide_code=True) | |
def _(unified_final): | |
unified_final | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md( | |
""" | |
Take a moment to explore this final unified dataset. Click around and notice how `train_mask` and `test_mask` are now accessible by both `mirna` and `split_type` coordinates. This is the beauty of xarray's coordinate system - instead of having six separate variables cluttering our namespace, we have a clean dimensional structure. | |
The coordinate system for `split_type` makes it so much easier to handle different splitting strategies. Want to compare training sets across all split types? Easy. Need just the temporal split masks? Simple coordinate selection. | |
""" | |
) | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md( | |
""" | |
Here's how elegant the selection becomes with proper coordinates: | |
```python | |
# Get training mask for random 80/20 split | |
train_mask = unified_final.train_mask.sel(split_type='random_80_20') | |
# Get test mask for temporal split | |
test_mask = unified_final.test_mask.sel(split_type='temporal_split') | |
# Compare training set sizes across all split types | |
train_counts = unified_final.train_mask.sum(dim='mirna') | |
``` | |
No more remembering variable names like `split_random_80_20_train_mask`. Just clean, intuitive coordinate-based selection. The plots below demonstrate this elegant syntax in action. | |
""" | |
) | |
return | |
@app.cell(hide_code=True) | |
def _(lab_data, plt, splits_data, unified_final): | |
# Demonstrate using splits with actual data | |
splits_fig, splits_axes = plt.subplots(1, 3, figsize=(15, 4)) | |
for viz_split_idx, (viz_split_name, viz_split_data) in enumerate( | |
splits_data.items() | |
): | |
# Get training data for this split using our new split_type dimension | |
viz_train_mask = unified_final.train_mask.sel(split_type=viz_split_name).values | |
viz_test_mask = unified_final.test_mask.sel(split_type=viz_split_name).values | |
# Calculate mean expression for train/test sets | |
viz_train_expression = lab_data.isel(mirna=viz_train_mask).mean( | |
dim=[ | |
"treatment", | |
"replicate", | |
"time_point", | |
"cell_line", | |
"experiment_date", | |
] | |
) | |
viz_test_expression = lab_data.isel(mirna=viz_test_mask).mean( | |
dim=[ | |
"treatment", | |
"replicate", | |
"time_point", | |
"cell_line", | |
"experiment_date", | |
] | |
) | |
# Plot distributions | |
splits_axes[viz_split_idx].hist( | |
viz_train_expression.values, | |
alpha=0.7, | |
label=f"Train (n={viz_train_mask.sum()})", | |
bins=20, | |
) | |
splits_axes[viz_split_idx].hist( | |
viz_test_expression.values, | |
alpha=0.7, | |
label=f"Test (n={viz_test_mask.sum()})", | |
bins=20, | |
) | |
splits_axes[viz_split_idx].set_title( | |
f"{viz_split_name.replace('_', ' ').title()}" | |
) | |
splits_axes[viz_split_idx].set_xlabel("Mean Expression") | |
splits_axes[viz_split_idx].set_ylabel("Count") | |
splits_axes[viz_split_idx].legend() | |
plt.tight_layout() | |
plt.show() | |
# Example of accessing specific microRNAs in a split | |
example_split = "random_80_20" | |
example_train_mirnas = splits_data[example_split]["train_mirnas"][:5] | |
example_test_mirnas = splits_data[example_split]["test_mirnas"][:5] | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md( | |
""" | |
## Storing as zarr | |
The final step is saving our unified dataset. Zarr format is perfect for this - it's cloud-native, supports chunking, and preserves all our coordinate information and metadata. | |
""" | |
) | |
return | |
@app.cell | |
def _(unified_final): | |
# Save the unified dataset to zarr format | |
import tempfile | |
from pathlib import Path | |
from loguru import logger | |
# Save the progressively-built unified dataset | |
temp_dir = Path(tempfile.mkdtemp()) | |
zarr_path = temp_dir / "complete_mirna_experiment_lifecycle.zarr" | |
unified_final.to_zarr(zarr_path, mode="w") | |
logger.info("Complete microRNA experimental lifecycle saved to zarr") | |
logger.info(f"Zarr file: {zarr_path}") | |
logger.info(f"Final size: {unified_final.nbytes / 1e6:.2f} MB") | |
logger.info(f"Workflow stage: {unified_final.attrs['workflow_stage']}") | |
return | |
@app.cell(hide_code=True) | |
def _(mo): | |
mo.md( | |
""" | |
## What have we built? | |
I cooked up this example while at SciPy 2025, while attending Ian Hunt-Isaak's talk on XArray for Biology, as an example of how we can unify data storage for microRNA experimental data with machine learning data. When everything lives in a single xarray Dataset, you stop losing track of which microRNA corresponds to which row in which CSV file. Your expression measurements, computed features, model results, and data splits all stay connected through the same coordinate system. | |
Here's what I love about this setup. You can slice across the entire experimental timeline with one line of code. Need ML features for highly-expressed microRNAs in your training set? It's just coordinate selection. Want to see which treatment effects your model captured? Same coordinate system makes it trivial. | |
```python | |
# Save entire experimental workflow to cloud | |
unified_final.to_zarr('s3://biodata/mirna_screen_2024.zarr') | |
# Load and reproduce analysis anywhere | |
experiment = xr.open_zarr('s3://biodata/mirna_screen_2024.zarr') | |
# Get training set microRNAs | |
train_mask = experiment.train_mask.sel(split_type='random_80_20') | |
training_mirnas = experiment.mirna.where(train_mask, drop=True) | |
# Get ML features for training set | |
ml_data = experiment.ml_features.sel(mirna=training_mirnas) | |
# Or combine with high expression condition | |
high_expression_condition = experiment.expression_level.mean(['treatment', 'replicate', 'time_point', 'cell_line', 'experiment_date']) > 10 | |
combined_condition = high_expression_condition & train_mask | |
ml_data_filtered = experiment.ml_features.sel(mirna=experiment.mirna.where(combined_condition, drop=True)) | |
``` | |
Time will distill the best practices in your context, but I've found this unified approach eliminates so much cognitive overhead. No more file juggling. No more wondering if your indices are still aligned. Just clean, coordinated data that scales to the cloud. | |
""" | |
) | |
return | |
if __name__ == "__main__": | |
app.run() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment