Last active
September 26, 2023 15:47
-
-
Save thanhleviet/1ad6b2d37711b3078755a0793e779e5e to your computer and use it in GitHub Desktop.
Python script for meging multiple biom files and write the merged file out in BIOM 1.0 format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import argparse | |
import sys | |
import os | |
import pandas as pd | |
from biom import load_table, Table | |
def merge_biom_files_with_pandas(biom_files): | |
dfs = [] # List to store DataFrames | |
for idx, file in enumerate(biom_files): | |
table = load_table(file) | |
data = {taxon: table.data(taxon, axis='observation')[0] for taxon in table.ids(axis='observation')} | |
sample_id = os.path.splitext(os.path.basename(file))[0] # Extract sample ID from file name | |
df = pd.DataFrame(data, index=[sample_id]).T | |
dfs.append(df) | |
# Merge all DataFrames | |
merged_df = pd.concat(dfs, axis=1, join='outer') | |
merged_df.fillna(0, inplace=True) # Replace NaN with 0 | |
return merged_df | |
# def derive_metadata_from_id(obs_id): | |
# # Parse the taxonomic string to generate metadata | |
# taxonomy = obs_id.split('|') | |
# return {"taxonomy": taxonomy} | |
def derive_metadata_from_id(obs_id): | |
# Expected prefixes in order | |
prefixes = ["k__", "p__", "c__", "o__", "f__", "g__", "s__"] | |
# Split the provided taxonomic string | |
parts = obs_id.split('|') | |
# Create a default taxonomy list with default values | |
taxonomy = [prefix + "Unassigned" for prefix in prefixes] | |
for part in parts: | |
for idx, prefix in enumerate(prefixes): | |
if part.startswith(prefix): | |
taxonomy[idx] = part | |
return {"taxonomy": taxonomy} | |
def main(): | |
parser = argparse.ArgumentParser(description='Merge BIOM files using pandas') | |
parser.add_argument('output_file', type=str, help='Output BIOM file') | |
parser.add_argument('input_files', type=str, nargs='+', help='Input BIOM files') | |
args = parser.parse_args() | |
output_file = args.output_file | |
input_files = args.input_files | |
try: | |
merged_df = merge_biom_files_with_pandas(input_files) | |
except Exception as e: | |
print(f"Error: {e}") | |
sys.exit(1) | |
# Convert merged DataFrame back to a BIOM Table | |
sample_ids = list(merged_df.columns) | |
observation_ids = list(merged_df.index) | |
data = merged_df.values.T.tolist() | |
# Convert data to sparse representation | |
sparse_data = [] | |
for row in range(len(observation_ids)): | |
for col in range(len(sample_ids)): | |
if data[col][row] != 0: | |
sparse_data.append([row, col, data[col][row]]) | |
# Construct the BIOM 1.0 table as a dictionary | |
biom_dict = { | |
"id": "Merged_Analysis", | |
"format": "Biological Observation Matrix 1.0.0", | |
"format_url": "http://biom-format.org", | |
"generated_by": "Custom_Script", | |
"date": pd.Timestamp.now().isoformat(), | |
"matrix_element_type": "float", | |
"shape": [len(observation_ids), len(sample_ids)], | |
"type": "OTU table", | |
"matrix_type": "sparse", | |
"data": sparse_data, | |
"rows": [{"id": obs_id, "metadata": derive_metadata_from_id(obs_id)} for obs_id in observation_ids], | |
"columns": [{"id": sample_id, "metadata": None} for sample_id in sample_ids] | |
} | |
# Serialize as JSON | |
try: | |
with open(output_file, 'w') as f: | |
import json | |
json.dump(biom_dict, f, indent=4) | |
print(f"Merged biom files to {output_file}") | |
except Exception as e: | |
print(f"Error: {e}") | |
sys.exit(1) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment