Skip to content

Instantly share code, notes, and snippets.

@thanhleviet
Last active September 26, 2023 15:47
Show Gist options
  • Save thanhleviet/1ad6b2d37711b3078755a0793e779e5e to your computer and use it in GitHub Desktop.
Save thanhleviet/1ad6b2d37711b3078755a0793e779e5e to your computer and use it in GitHub Desktop.
Python script for meging multiple biom files and write the merged file out in BIOM 1.0 format
#!/usr/bin/env python
import argparse
import sys
import os
import pandas as pd
from biom import load_table, Table
def merge_biom_files_with_pandas(biom_files):
dfs = [] # List to store DataFrames
for idx, file in enumerate(biom_files):
table = load_table(file)
data = {taxon: table.data(taxon, axis='observation')[0] for taxon in table.ids(axis='observation')}
sample_id = os.path.splitext(os.path.basename(file))[0] # Extract sample ID from file name
df = pd.DataFrame(data, index=[sample_id]).T
dfs.append(df)
# Merge all DataFrames
merged_df = pd.concat(dfs, axis=1, join='outer')
merged_df.fillna(0, inplace=True) # Replace NaN with 0
return merged_df
# def derive_metadata_from_id(obs_id):
# # Parse the taxonomic string to generate metadata
# taxonomy = obs_id.split('|')
# return {"taxonomy": taxonomy}
def derive_metadata_from_id(obs_id):
# Expected prefixes in order
prefixes = ["k__", "p__", "c__", "o__", "f__", "g__", "s__"]
# Split the provided taxonomic string
parts = obs_id.split('|')
# Create a default taxonomy list with default values
taxonomy = [prefix + "Unassigned" for prefix in prefixes]
for part in parts:
for idx, prefix in enumerate(prefixes):
if part.startswith(prefix):
taxonomy[idx] = part
return {"taxonomy": taxonomy}
def main():
parser = argparse.ArgumentParser(description='Merge BIOM files using pandas')
parser.add_argument('output_file', type=str, help='Output BIOM file')
parser.add_argument('input_files', type=str, nargs='+', help='Input BIOM files')
args = parser.parse_args()
output_file = args.output_file
input_files = args.input_files
try:
merged_df = merge_biom_files_with_pandas(input_files)
except Exception as e:
print(f"Error: {e}")
sys.exit(1)
# Convert merged DataFrame back to a BIOM Table
sample_ids = list(merged_df.columns)
observation_ids = list(merged_df.index)
data = merged_df.values.T.tolist()
# Convert data to sparse representation
sparse_data = []
for row in range(len(observation_ids)):
for col in range(len(sample_ids)):
if data[col][row] != 0:
sparse_data.append([row, col, data[col][row]])
# Construct the BIOM 1.0 table as a dictionary
biom_dict = {
"id": "Merged_Analysis",
"format": "Biological Observation Matrix 1.0.0",
"format_url": "http://biom-format.org",
"generated_by": "Custom_Script",
"date": pd.Timestamp.now().isoformat(),
"matrix_element_type": "float",
"shape": [len(observation_ids), len(sample_ids)],
"type": "OTU table",
"matrix_type": "sparse",
"data": sparse_data,
"rows": [{"id": obs_id, "metadata": derive_metadata_from_id(obs_id)} for obs_id in observation_ids],
"columns": [{"id": sample_id, "metadata": None} for sample_id in sample_ids]
}
# Serialize as JSON
try:
with open(output_file, 'w') as f:
import json
json.dump(biom_dict, f, indent=4)
print(f"Merged biom files to {output_file}")
except Exception as e:
print(f"Error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment