# Harmonization of Multi-Site Data When brain images are acquired at different sites or on different scanners, the resulting scalar measurements contain site-specific technical biases that can mask or mimic real biological effects. **ComBat** removes these additive and multiplicative site effects while preserving biological covariates like age and sex. This notebook demonstrates the `harmonize_volumes` function on synthetic data with known site biases and a known age effect. ## Pipeline overview 1. Generate synthetic multi-site scalar maps with known biases 2. Visualize site effects before harmonization 3. Run ComBat harmonization 4. Compare before and after 5. Verify that the biological (age) effect is preserved ## 1. Generate synthetic multi-site data We create synthetic scalar volumes for 3 "sites" with 8 subjects each. Each site has a different additive shift and multiplicative scale applied on top of a shared biological signal (a smooth gradient + age effect). ```python import matplotlib.pyplot as plt import numpy as np import pandas as pd from kwneuro.resource import InMemoryVolumeResource rng = np.random.default_rng(123) shape = (30, 36, 20) n_per_site = 8 sites = ["Scanner_A", "Scanner_B", "Scanner_C"] # Smooth base signal (shared biology) x = np.linspace(0.2, 0.8, shape[0])[:, None, None] y = np.linspace(0.3, 0.6, shape[1])[None, :, None] base = x * y * np.ones(shape) # Site-specific biases site_params = { "Scanner_A": {"shift": 0.0, "scale": 1.0}, "Scanner_B": {"shift": 0.15, "scale": 1.4}, "Scanner_C": {"shift": -0.1, "scale": 0.7}, } # Brain mask mask_array = np.zeros(shape, dtype=np.int8) mask_array[4:26, 4:32, 3:17] = 1 mask = InMemoryVolumeResource(array=mask_array, affine=np.eye(4), metadata={}) # Generate volumes volumes = [] site_labels = [] ages = [] for site in sites: p = site_params[site] for _ in range(n_per_site): age = rng.uniform(25, 75) age_effect = age * 0.002 noise = rng.normal(0, 0.015, shape) vol_data = (base + age_effect + noise) * p["scale"] + p["shift"] volumes.append(InMemoryVolumeResource( array=vol_data, affine=np.eye(4), metadata={} )) site_labels.append(site) ages.append(age) covars = pd.DataFrame({"site": site_labels, "age": ages}) print(f"Generated {len(volumes)} volumes across {len(sites)} sites") print(covars.groupby("site")["age"].describe()[["count", "mean", "std"]]) ``` Generated 24 volumes across 3 sites count mean std site Scanner_A 8.0 54.924903 15.648836 Scanner_B 8.0 57.283493 12.979633 Scanner_C 8.0 52.865805 12.695440 ## 2. Visualize site effects before harmonization Each site's voxel-value distribution is shifted and scaled differently. ```python mask_idx = np.nonzero(mask_array > 0) mid_slice = shape[2] // 2 fig, axes = plt.subplots(1, 4, figsize=(14, 3), width_ratios=[3, 1, 1, 1]) # Histograms for site, color in zip(sites, ["tab:blue", "tab:orange", "tab:green"]): site_voxels = [] for vol, s in zip(volumes, site_labels): if s == site: site_voxels.append(vol.get_array()[mask_idx]) all_voxels = np.concatenate(site_voxels) axes[0].hist(all_voxels, bins=60, alpha=0.5, label=site, color=color, density=True) axes[0].set_xlabel("Voxel value") axes[0].set_ylabel("Density") axes[0].set_title("Before harmonization") axes[0].legend() # Example slices (one subject per site, shared color range) all_masked = np.concatenate([v.get_array()[mask_idx] for v in volumes]) vmin, vmax = np.percentile(all_masked, [1, 99]) for i, site in enumerate(sites): idx = site_labels.index(site) arr = volumes[idx].get_array()[:, :, mid_slice] axes[i + 1].imshow(arr.T, cmap="viridis", origin="lower", vmin=vmin, vmax=vmax) axes[i + 1].set_title(site) axes[i + 1].axis("off") plt.tight_layout() plt.show() ``` ![png](example-harmonization_files/example-harmonization_4_0.png) ## 3. Run ComBat harmonization ```python from kwneuro.harmonize import harmonize_volumes harmonized, estimates = harmonize_volumes( volumes, covars, batch_col="site", mask=mask, continuous_cols=["age"], ) print(f"Harmonized {len(harmonized)} volumes") ``` [neuroCombat] Creating design matrix [neuroCombat] Standardizing data across features [neuroCombat] Fitting L/S model and finding priors [neuroCombat] Finding parametric adjustments [neuroCombat] Final adjustment of data Harmonized 24 volumes ## 4. Compare before and after After harmonization, the per-site distributions should overlap. ```python fig, axes = plt.subplots(1, 4, figsize=(14, 3), width_ratios=[3, 1, 1, 1]) for site, color in zip(sites, ["tab:blue", "tab:orange", "tab:green"]): site_voxels = [] for vol, s in zip(harmonized, site_labels): if s == site: site_voxels.append(vol.get_array()[mask_idx]) all_voxels = np.concatenate(site_voxels) axes[0].hist(all_voxels, bins=60, alpha=0.5, label=site, color=color, density=True) axes[0].set_xlabel("Voxel value") axes[0].set_ylabel("Density") axes[0].set_title("After harmonization") axes[0].legend() # Use same color range as the "before" plot for comparison for i, site in enumerate(sites): idx = site_labels.index(site) arr = harmonized[idx].get_array()[:, :, mid_slice] axes[i + 1].imshow(arr.T, cmap="viridis", origin="lower", vmin=vmin, vmax=vmax) axes[i + 1].set_title(site) axes[i + 1].axis("off") plt.tight_layout() plt.show() # Print site means print("Per-site mean voxel values:") for site in sites: before_means = [ v.get_array()[mask_idx].mean() for v, s in zip(volumes, site_labels) if s == site ] after_means = [ v.get_array()[mask_idx].mean() for v, s in zip(harmonized, site_labels) if s == site ] print( f" {site:12s} before: {np.mean(before_means):.4f} " f"after: {np.mean(after_means):.4f}" ) ``` ![png](example-harmonization_files/example-harmonization_8_0.png) Per-site mean voxel values: Scanner_A before: 0.3348 after: 0.3640 Scanner_B before: 0.6254 after: 0.3685 Scanner_C before: 0.1315 after: 0.3595 ## 5. Verify covariate preservation The age effect should survive harmonization. We check the correlation between age and mean voxel intensity before and after. ```python means_before = [v.get_array()[mask_idx].mean() for v in volumes] means_after = [v.get_array()[mask_idx].mean() for v in harmonized] corr_before = np.corrcoef(ages, means_before)[0, 1] corr_after = np.corrcoef(ages, means_after)[0, 1] fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True) for site, color in zip(sites, ["tab:blue", "tab:orange", "tab:green"]): mask_s = [s == site for s in site_labels] site_ages = [a for a, m in zip(ages, mask_s) if m] site_before = [m for m, s in zip(means_before, mask_s) if s] site_after = [m for m, s in zip(means_after, mask_s) if s] axes[0].scatter(site_ages, site_before, label=site, color=color, s=50) axes[1].scatter(site_ages, site_after, label=site, color=color, s=50) axes[0].set_xlabel("Age") axes[0].set_ylabel("Mean voxel value") axes[0].set_title(f"Before (r = {corr_before:.3f})") axes[0].legend() axes[1].set_xlabel("Age") axes[1].set_title(f"After (r = {corr_after:.3f})") axes[1].legend() plt.suptitle("Age vs. mean voxel value") plt.tight_layout() plt.show() print(f"Age correlation before: {corr_before:.4f}") print(f"Age correlation after: {corr_after:.4f}") ``` ![png](example-harmonization_files/example-harmonization_10_0.png) Age correlation before: 0.2661 Age correlation after: 0.9691