# Video-R1 / get_parquet_data.py
import json
import os

from datasets import Dataset, Features, Sequence, Value, Image
# 1️⃣ Load your JSON file (which is a top-level list of dicts)
with open("Train_QA_10k_noFreeForm.json", "r") as f:
records = json.load(f) # List[Dict]
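# Each record is expected to carry the fields declared in the schema of step 4,
# e.g. (illustrative values only, inferred from the Features definition below):
# {"problem_id": 0, "problem": "...", "data_type": "image",
#  "problem_type": "multiple choice", "options": ["A ...", "B ..."],
#  "solution": "...", "data_source": "...", "answer": "...",
#  "path": "images/0001.jpg"}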
# 2️⃣ Build an HF Dataset
ds = Dataset.from_list(records)
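# from_list builds an in-memory Arrow table; column types are inferred here
# and then pinned down explicitly by the schema in step 4.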
# 3️⃣ Read each image file into raw bytes
def read_image_bytes(example):
    with open(example["path"], "rb") as img_f:
        example["image_bytes"] = img_f.read()
    return example
# we keep all original columns and add "image_bytes"
ds = ds.map(read_image_bytes)
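# For large datasets the file reads can be parallelized; num_proc is a
# standard Dataset.map argument (pick a worker count suited to your machine):
# ds = ds.map(read_image_bytes, num_proc=8)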
# 4️⃣ Define your schema, telling HF that image_bytes is binary
features = Features({
    "problem_id": Value("int64"),
    "problem": Value("string"),
    "data_type": Value("string"),
    "problem_type": Value("string"),
    "options": Sequence(Value("string")),
    "solution": Value("string"),
    "data_source": Value("string"),
    # "prompt": Value("string"),
    "answer": Value("string"),
    "path": Value("string"),
    "image_bytes": Value("binary"),  # ← raw bytes in Arrow
})
ds = ds.cast(features)
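# cast() rewrites the underlying Arrow table to the declared schema and raises
# if a column cannot be converted, so type mismatches surface at this point.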
# 5️⃣ Rename the byte column and cast it to an Image feature that decodes to PIL
ds = ds.rename_column("image_bytes", "images")
ds = ds.cast_column("images", Image(decode=True))
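# With decode=True, the Image feature wraps the stored bytes so that indexing
# the column returns decoded PIL.Image objects instead of raw bytes.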
# 6️⃣ Sanity-check
img0 = ds[0]["images"]
print(img0)
# → PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x384
# 7️⃣ Finally, write out to Parquet (the bytes go in the file)
os.makedirs("./hf_data", exist_ok=True)  # the writer fails if the directory is missing
ds.to_parquet("./hf_data/Train_QA_10k_noFreeForm.parquet")
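# 8️⃣ Optional round-trip check (a minimal sketch, not part of the original
# pipeline): reload the Parquet file and confirm the image column still
# decodes. The re-cast is defensive, in case your datasets version does not
# restore the Image feature from the Parquet metadata.
ds2 = Dataset.from_parquet("./hf_data/Train_QA_10k_noFreeForm.parquet")
ds2 = ds2.cast_column("images", Image(decode=True))
print(ds2[0]["images"])  # expect a PIL image, matching the sanity check above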