Created April 20, 2025 14:10
-
-
Save maujim/5223d4b04f6044d46c469669add7a000 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import json | |
import pandas as pd | |
from datasets import load_dataset | |
def infer_modalities(array_len, prefix, block_size=7):
    """Split a flat vector of ``array_len`` elements into consecutive index ranges.

    Heuristic: the vector is chunked into ``block_size``-element blocks
    (7 by default). The last block is shorter when ``array_len`` is not a
    multiple of ``block_size``.

    Args:
        array_len: Total number of elements in the vector.
        prefix: Key prefix; blocks are named ``{prefix}_0``, ``{prefix}_1``, ...
        block_size: Maximum number of elements per block. Must be >= 1.

    Returns:
        A dict mapping each block key to ``{"start": ..., "end": ...}``.
        Each block's ``end`` equals the next block's ``start``, and the last
        ``end`` equals ``array_len``. The dict is empty when ``array_len``
        is zero or negative.

    Raises:
        ValueError: If ``block_size`` is less than 1.
    """
    # A non-positive block size could never advance through the vector.
    if block_size < 1:
        raise ValueError(f"block_size must be a positive integer, got {block_size}")

    return {
        f"{prefix}_{i}": {
            "start": start,
            "end": min(start + block_size, array_len),
        }
        for i, start in enumerate(range(0, array_len, block_size))
    }
def main(dataset_path: str, output_path: str = "meta/modality.json"):
    """Generate a modality config JSON from the first row of a dataset.

    Loads the ``train`` split of ``dataset_path`` with ``load_dataset``, takes
    the parquet file path from the first example's ``['data']['file']`` entry,
    and reads that parquet file. The lengths of the first row's
    ``observation.state`` and ``action`` values are passed to
    ``infer_modalities`` to build the ``state`` and ``action`` sections.
    The ``video`` section is a fixed ``ego_view`` entry.

    Args:
        dataset_path: Dataset identifier or path passed to ``load_dataset``.
        output_path: Destination of the JSON file. Missing parent
            directories are created.

    Raises:
        RuntimeError: If the first dataset example has no ``'data'`` field.
    """
    print(f"Loading dataset from {dataset_path}")
    ds = load_dataset(dataset_path, split="train")  # adjust split if needed

    # Fetch the first example once instead of re-indexing the dataset.
    first_example = ds[0]
    parquet_path = first_example['data']['file'] if 'data' in first_example else None
    if parquet_path is None:
        raise RuntimeError("No 'data' field found in HuggingFace dataset example.")

    print(f"Reading first parquet file: {parquet_path}")
    df = pd.read_parquet(parquet_path)

    # Only the first row is inspected; its vector lengths drive the layout.
    state_example = df['observation.state'].iloc[0]
    action_example = df['action'].iloc[0]
    print(f"State length: {len(state_example)}")
    print(f"Action length: {len(action_example)}")

    state_modalities = infer_modalities(len(state_example), "state")
    action_modalities = infer_modalities(len(action_example), "action")

    modality_config = {
        "state": state_modalities,
        "action": action_modalities,
        "video": {
            "ego_view": {
                "original_key": "observation.images.ego_view"
            }
        }
    }

    # os.path.dirname() returns "" for a bare filename, and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(modality_config, f, indent=4)
    print(f"✅ modality.json written to {output_path}")
# Script entry point: requires the dataset path as the first CLI argument.
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python generate_modality.py <huggingface_dataset_path>")
        # Use sys.exit rather than the bare exit() builtin: exit() is injected
        # by the site module and is missing under `python -S` or in frozen apps.
        sys.exit(1)
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.