Skip to content

Instantly share code, notes, and snippets.

View apcamargo's full-sized avatar
🦖

Antônio Camargo apcamargo

🦖
  • University of São Paulo
  • São Paulo, SP, Brazil
  • X @apcamargo_
View GitHub Profile
#!/usr/bin/env python
# https://adamj.eu/tech/2025/07/20/python-fix-brokenpipeerror/
import os
import sys
from contextlib import contextmanager
from typing import Generator
@apcamargo
apcamargo / fix_taxdump.py
Last active May 27, 2026 16:55
Renumber taxids in taxdumps
#!/usr/bin/env python
import argparse
from pathlib import Path
def main():
parser = argparse.ArgumentParser(
description="Renumber taxids in taxdumps."
)
from bisect import bisect_left, bisect_right
from collections import defaultdict
from math import floor, sqrt
def iter_similar_sets(a, b, metric="jaccard", min_similarity=0.0):
"""
Yield pairs of dictionary keys whose set similarity meets a threshold.
Parameters
#!/usr/bin/env python
import argparse
import math
import sys
from dataclasses import dataclass
from pathlib import Path
ANY_BACK = 1e-5
#!/usr/bin/env python
import shutil
from pathlib import Path
from typing import Literal
import pyarrow as pa
import pyarrow.parquet as pq
@apcamargo
apcamargo / find_cutoff.py
Last active October 11, 2025 19:52
Automatic cutoff determination for an arbitrary distribution
from typing import Sequence
import math
def find_cutoff(values: Sequence[float]) -> float:
"""
Determine the cutoff point in a biphasic distribution curve by identifying
the "bending point" where the curve transitions from slowly growing values
to rapidly growing values, using the maximum perpendicular distance method.
@apcamargo
apcamargo / query_sra_duckdb.sh
Created July 13, 2025 21:11
Query SRA metadata stored as Parquet files in S3 using DuckDB
# Dump the SRA metadata table to stdout as delimited text with a header row.
# httpfs is installed and loaded before the query so that read_parquet can be
# given the s3:// URL; the trailing /* glob reads every file under the
# metadata prefix. SELECT * keeps all columns, and COPY ... TO STDOUT lets
# the caller pipe or redirect the result.
# NOTE(review): E'\t' is expected to be interpreted by DuckDB as a tab
# delimiter (the shell passes the backslash through unchanged inside double
# quotes) — confirm against the DuckDB version in use.
duckdb -c "
INSTALL httpfs;
LOAD httpfs;
INSTALL parquet;
LOAD parquet;
COPY (
SELECT *
FROM read_parquet('s3://sra-pub-metadata-us-east-1/sra/metadata/*')
) TO STDOUT WITH (FORMAT CSV, DELIMITER E'\t', HEADER);"
@apcamargo
apcamargo / download_mg_rast.py
Last active July 12, 2025 21:20
Downloads all the assembled metagenomes available in MG-RAST
#!/usr/bin/env python
import json
import re
import sys
from typing import Generator, Dict, Any, Optional
import requests
from tqdm import tqdm
from pathlib import Path
from typing import Iterator, Optional, Union
import polars as pl
from needletail import parse_fastx_file
from polars.io.plugins import register_io_source
def scan_fastx(fastx_file: Union[str, Path]) -> pl.LazyFrame:
schema = pl.Schema(
@apcamargo
apcamargo / sam2tsv.py
Created March 10, 2025 03:53
Converts alignments stored in the SAM format to a BLAST-like table
#!/usr/bin/env python
"""
This script processes SAM (Sequence Alignment/Map format) inputs from standard
input and extracts alignment information that is then provided in a tab-separated
table. The following fields are produced: query, target, query_length, query_start,
query_end, target_start, target_end, alignment_length, alignment_identity.
This script was designed for use with SAM files produced by minimap2. However,
it will work with any SAM data that: