Skip to content

Instantly share code, notes, and snippets.

@maujim
Created April 20, 2025 14:10
Show Gist options
  • Save maujim/5223d4b04f6044d46c469669add7a000 to your computer and use it in GitHub Desktop.
Save maujim/5223d4b04f6044d46c469669add7a000 to your computer and use it in GitHub Desktop.
import os
import json
import pandas as pd
from datasets import load_dataset
def infer_modalities(array_len, prefix, block_size=7):
    """Split a flat vector of ``array_len`` elements into consecutive blocks.

    Heuristic: the vector is chunked into ``block_size``-element blocks
    (7 by default). The final block is shorter when ``array_len`` is not a
    multiple of ``block_size``.

    Args:
        array_len: Total number of elements to partition.
        prefix: Key prefix; blocks are named ``f"{prefix}_{i}"`` with ``i``
            counting up from 0.
        block_size: Number of elements per block. Must be at least 1.

    Returns:
        A dict mapping each block name to ``{"start": s, "end": e}``, where
        each block's ``end`` equals the next block's ``start`` and the last
        ``end`` equals ``array_len``. Empty when ``array_len`` is not positive.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")

    return {
        f"{prefix}_{i}": {
            "start": start,
            # Clamp so the trailing partial block stops at the vector's end.
            "end": min(start + block_size, array_len),
        }
        for i, start in enumerate(range(0, array_len, block_size))
    }
def main(dataset_path: str, output_path: str = "meta/modality.json"):
    """Generate a modality JSON config from a HuggingFace dataset.

    Loads the ``train`` split of ``dataset_path``, reads the parquet file
    referenced by the first example, measures the lengths of the first
    ``observation.state`` and ``action`` entries, partitions each with
    ``infer_modalities`` and writes the resulting config to ``output_path``.

    Args:
        dataset_path: Path or identifier passed to ``load_dataset``.
        output_path: Destination of the JSON file. Its parent directory is
            created when missing; a bare filename is written to the current
            working directory.

    Raises:
        RuntimeError: If the first dataset example has no ``'data'`` field.
    """
    print(f"Loading dataset from {dataset_path}")
    ds = load_dataset(dataset_path, split="train")  # adjust split if needed

    # Index once; the first example is expected to hold the parquet location
    # under example['data']['file'].
    first_example = ds[0]
    parquet_path = first_example['data']['file'] if 'data' in first_example else None
    if parquet_path is None:
        raise RuntimeError("No 'data' field found in HuggingFace dataset example.")

    print(f"Reading first parquet file: {parquet_path}")
    df = pd.read_parquet(parquet_path)

    # Only the first row is inspected to determine the vector lengths.
    state_example = df['observation.state'].iloc[0]
    action_example = df['action'].iloc[0]
    print(f"State length: {len(state_example)}")
    print(f"Action length: {len(action_example)}")

    state_modalities = infer_modalities(len(state_example), "state")
    action_modalities = infer_modalities(len(action_example), "action")
    modality_config = {
        "state": state_modalities,
        "action": action_modalities,
        "video": {
            "ego_view": {
                "original_key": "observation.images.ego_view"
            }
        }
    }

    # os.path.dirname returns "" for a bare filename, and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(modality_config, f, indent=4)
    print(f"✅ modality.json written to {output_path}")
if __name__ == "__main__":
    import sys

    # The dataset path is the single required command-line argument.
    if len(sys.argv) < 2:
        print("Usage: python generate_modality.py <huggingface_dataset_path>")
        # Use sys.exit instead of the builtin exit(): the builtin is injected
        # by the site module and is missing under `python -S` or frozen apps.
        sys.exit(1)
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment