# Video-R1 / get_parquet_data.py
import json
import os

from datasets import Dataset, Features, Sequence, Value, Image
# 1️⃣ Load your JSON file (which is a top-level list of dicts)
with open("Train_QA_10k_noFreeForm.json", "r") as f:
records = json.load(f) # List[Dict]
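# Each record is expected to carry the fields declared in the schema of step 4,
# e.g. (illustrative values only, inferred from the Features definition below):
# {"problem_id": 0, "problem": "...", "data_type": "image",
#  "problem_type": "multiple choice", "options": ["A ...", "B ..."],
#  "solution": "...", "data_source": "...", "answer": "...",
#  "path": "images/0001.jpg"}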
# 2️⃣ Build an HF Dataset
ds = Dataset.from_list(records)
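# from_list builds an in-memory Arrow table; column types are inferred here
# and then pinned down explicitly by the schema in step 4.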
# 3️⃣ Read each image file into raw bytes
def read_image_bytes(example):
    with open(example["path"], "rb") as img_f:
        example["image_bytes"] = img_f.read()
    return example
# we keep all original columns and add "image_bytes"
ds = ds.map(read_image_bytes)
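# For large datasets the file reads can be parallelized; num_proc is a
# standard Dataset.map argument (pick a worker count suited to your machine):
# ds = ds.map(read_image_bytes, num_proc=8)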
# 4️⃣ Define your schema, telling HF that image_bytes is binary
features = Features({
    "problem_id": Value("int64"),
    "problem": Value("string"),
    "data_type": Value("string"),
    "problem_type": Value("string"),
    "options": Sequence(Value("string")),
    "solution": Value("string"),
    "data_source": Value("string"),
    # "prompt": Value("string"),
    "answer": Value("string"),
    "path": Value("string"),
    "image_bytes": Value("binary"),  # ← raw bytes in Arrow
})
ds = ds.cast(features)
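# cast() rewrites the underlying Arrow table to the declared schema and raises
# if a column cannot be converted, so type mismatches surface at this point.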
# 5️⃣ Rename the byte column and cast it to an Image feature that decodes to PIL
ds = ds.rename_column("image_bytes", "images")
ds = ds.cast_column("images", Image(decode=True))
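# With decode=True, the Image feature wraps the stored bytes so that indexing
# the column returns decoded PIL.Image objects instead of raw bytes.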
# 6️⃣ Sanity-check
img0 = ds[0]["images"]
print(img0)
# → PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x384
# 7️⃣ Finally, write out to Parquet (the bytes go in the file)
os.makedirs("./hf_data", exist_ok=True)  # the writer fails if the directory is missing
ds.to_parquet("./hf_data/Train_QA_10k_noFreeForm.parquet")
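# 8️⃣ Optional round-trip check (a minimal sketch, not part of the original
# pipeline): reload the Parquet file and confirm the image column still
# decodes. The re-cast is defensive, in case your datasets version does not
# restore the Image feature from the Parquet metadata.
ds2 = Dataset.from_parquet("./hf_data/Train_QA_10k_noFreeForm.parquet")
ds2 = ds2.cast_column("images", Image(decode=True))
print(ds2[0]["images"])  # expect a PIL image, matching the sanity check above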