"""Convert a JSON QA dataset with image paths into a Parquet file with embedded images.

Reads ``Train_QA_10k_noFreeForm.json`` (a top-level list of record dicts, each
holding a ``"path"`` to an image file on disk), inlines every image's raw bytes
into the dataset, casts the byte column to a decodable Hugging Face ``Image``
feature, and writes the result to Parquet with the bytes embedded in the file.
"""

import json
from pathlib import Path

from datasets import Dataset, Features, Image, Sequence, Value


def _read_image_bytes(example):
    """Map fn: attach the raw bytes of the image at ``example["path"]``.

    All original columns are kept; a new ``image_bytes`` column is added.
    """
    example["image_bytes"] = Path(example["path"]).read_bytes()
    return example


def main():
    # 1️⃣ Load the JSON file (which is a top-level list of dicts).
    with open("Train_QA_10k_noFreeForm.json", "r", encoding="utf-8") as f:
        records = json.load(f)  # List[Dict]

    # 2️⃣ Build an HF Dataset from the records.
    ds = Dataset.from_list(records)

    # 3️⃣ Read each image file into raw bytes.
    #    (No remove_columns: every original column is kept alongside image_bytes.)
    ds = ds.map(_read_image_bytes)

    # 4️⃣ Cast to an explicit schema, telling HF that image_bytes is binary.
    features = Features({
        "problem_id": Value("int64"),
        "problem": Value("string"),
        "data_type": Value("string"),
        "problem_type": Value("string"),
        "options": Sequence(Value("string")),
        "solution": Value("string"),
        "data_source": Value("string"),
        # "prompt": Value("string"),
        "answer": Value("string"),
        "path": Value("string"),
        "image_bytes": Value("binary"),  # ← raw bytes in Arrow
    })
    ds = ds.cast(features)

    # 5️⃣ Rename, and cast that byte-column to an Image feature that decodes to PIL.
    ds = ds.rename_column("image_bytes", "images")
    ds = ds.cast_column("images", Image(decode=True))

    # 6️⃣ Sanity-check: the first row should decode to a PIL image object.
    img0 = ds[0]["images"]
    print(img0)  # → PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x384

    # 7️⃣ Finally, write out to Parquet (the bytes go in the file).
    #    Create the output directory first so to_parquet cannot fail on a
    #    fresh checkout where ./hf_data does not yet exist.
    out_path = Path("./hf_data/Train_QA_10k_noFreeForm.parquet")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    ds.to_parquet(str(out_path))


if __name__ == "__main__":
    main()