thanhleviet · September 26, 2023 15:47
diff --git a/merge_biom.py b/merge_biom.py
 #!/usr/bin/env python
 import argparse
 import sys
 import os
 import pandas as pd
 from biom import load_table, Table

 def merge_biom_files_with_pandas(biom_files):
    dfs = []  # List to store DataFrames

    for idx, file in enumerate(biom_files):
        table = load_table(file)
        data = {taxon: table.data(taxon, axis='observation')[0] for taxon in table.ids(axis='observation')}
        sample_id = os.path.splitext(os.path.basename(file))[0]  # Extract sample ID from file name
        df = pd.DataFrame(data, index=[sample_id]).T
        dfs.append(df)

    # Merge all DataFrames
    merged_df = pd.concat(dfs, axis=1, join='outer')
    merged_df.fillna(0, inplace=True)  # Replace NaN with 0

    return merged_df

 # def derive_metadata_from_id(obs_id):
 #     # Parse the taxonomic string to generate metadata
 #     taxonomy = obs_id.split('|')
 #     return {"taxonomy": taxonomy}
 def derive_metadata_from_id(obs_id):
    # Expected prefixes in order
    prefixes = ["k__", "p__", "c__", "o__", "f__", "g__", "s__"]
    
    # Split the provided taxonomic string
    parts = obs_id.split('|')
    
    # Create a default taxonomy list with default values
    taxonomy = [prefix + "Unassigned" for prefix in prefixes]
    
    for part in parts:
        for idx, prefix in enumerate(prefixes):
            if part.startswith(prefix):
                taxonomy[idx] = part
                
    return {"taxonomy": taxonomy}

 def main():
    parser = argparse.ArgumentParser(description='Merge BIOM files using pandas')
    parser.add_argument('output_file', type=str, help='Output BIOM file')
    parser.add_argument('input_files', type=str, nargs='+', help='Input BIOM files')
    args = parser.parse_args()

    output_file = args.output_file
    input_files = args.input_files

    try:
        merged_df = merge_biom_files_with_pandas(input_files)
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)
    
    # Convert merged DataFrame back to a BIOM Table
    sample_ids = list(merged_df.columns)
    observation_ids = list(merged_df.index)
    data = merged_df.values.T.tolist()

    # Convert data to sparse representation
    sparse_data = []
    for row in range(len(observation_ids)):
        for col in range(len(sample_ids)):
            if data[col][row] != 0:
                sparse_data.append([row, col, data[col][row]])

    # Construct the BIOM 1.0 table as a dictionary
    biom_dict = {
        "id": "Merged_Analysis",
        "format": "Biological Observation Matrix 1.0.0",
        "format_url": "http://biom-format.org",
        "generated_by": "Custom_Script",
        "date": pd.Timestamp.now().isoformat(),
        "matrix_element_type": "float",
        "shape": [len(observation_ids), len(sample_ids)],
        "type": "OTU table",
        "matrix_type": "sparse",
        "data": sparse_data,
        "rows": [{"id": obs_id, "metadata": derive_metadata_from_id(obs_id)} for obs_id in observation_ids],
        "columns": [{"id": sample_id, "metadata": None} for sample_id in sample_ids]
    }

    # Serialize as JSON
    try:
        with open(output_file, 'w') as f:
            import json
            json.dump(biom_dict, f, indent=4)
        print(f"Merged biom files to {output_file}")    
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)

 if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	import argparse
	import sys
	import os
	import pandas as pd
	from biom import load_table, Table

	def merge_biom_files_with_pandas(biom_files):
	dfs = [] # List to store DataFrames

	for idx, file in enumerate(biom_files):
	table = load_table(file)
	data = {taxon: table.data(taxon, axis='observation')[0] for taxon in table.ids(axis='observation')}
	sample_id = os.path.splitext(os.path.basename(file))[0] # Extract sample ID from file name
	df = pd.DataFrame(data, index=[sample_id]).T
	dfs.append(df)

	# Merge all DataFrames
	merged_df = pd.concat(dfs, axis=1, join='outer')
	merged_df.fillna(0, inplace=True) # Replace NaN with 0

	return merged_df

	# def derive_metadata_from_id(obs_id):
	# # Parse the taxonomic string to generate metadata
	# taxonomy = obs_id.split('\|')
	# return {"taxonomy": taxonomy}
	def derive_metadata_from_id(obs_id):
	# Expected prefixes in order
	prefixes = ["k__", "p__", "c__", "o__", "f__", "g__", "s__"]

	# Split the provided taxonomic string
	parts = obs_id.split('\|')

	# Create a default taxonomy list with default values
	taxonomy = [prefix + "Unassigned" for prefix in prefixes]

	for part in parts:
	for idx, prefix in enumerate(prefixes):
	if part.startswith(prefix):
	taxonomy[idx] = part

	return {"taxonomy": taxonomy}

	def main():
	parser = argparse.ArgumentParser(description='Merge BIOM files using pandas')
	parser.add_argument('output_file', type=str, help='Output BIOM file')
	parser.add_argument('input_files', type=str, nargs='+', help='Input BIOM files')
	args = parser.parse_args()

	output_file = args.output_file
	input_files = args.input_files

	try:
	merged_df = merge_biom_files_with_pandas(input_files)
	except Exception as e:
	print(f"Error: {e}")
	sys.exit(1)

	# Convert merged DataFrame back to a BIOM Table
	sample_ids = list(merged_df.columns)
	observation_ids = list(merged_df.index)
	data = merged_df.values.T.tolist()

	# Convert data to sparse representation
	sparse_data = []
	for row in range(len(observation_ids)):
	for col in range(len(sample_ids)):
	if data[col][row] != 0:
	sparse_data.append([row, col, data[col][row]])

	# Construct the BIOM 1.0 table as a dictionary
	biom_dict = {
	"id": "Merged_Analysis",
	"format": "Biological Observation Matrix 1.0.0",
	"format_url": "http://biom-format.org",
	"generated_by": "Custom_Script",
	"date": pd.Timestamp.now().isoformat(),
	"matrix_element_type": "float",
	"shape": [len(observation_ids), len(sample_ids)],
	"type": "OTU table",
	"matrix_type": "sparse",
	"data": sparse_data,
	"rows": [{"id": obs_id, "metadata": derive_metadata_from_id(obs_id)} for obs_id in observation_ids],
	"columns": [{"id": sample_id, "metadata": None} for sample_id in sample_ids]
	}

	# Serialize as JSON
	try:
	with open(output_file, 'w') as f:
	import json
	json.dump(biom_dict, f, indent=4)
	print(f"Merged biom files to {output_file}")
	except Exception as e:
	print(f"Error: {e}")
	sys.exit(1)

	if __name__ == "__main__":
	main()