import io
import json
from pathlib import Path

from datasets import Dataset, Features, Sequence, Value, Image
from PIL import Image as PILImage


# Load the question/answer records from the JSON export.
# The encoding is given explicitly: JSON interchange text is UTF-8 (RFC 8259),
# and the default for text-mode open() depends on the platform locale.
with open("Train_QA_10k_noFreeForm.json", "r", encoding="utf-8") as f:
    records = json.load(f)

# Dataset.from_list expects a list of per-row dicts; `records` is assumed to
# have that shape (the JSON file itself is not visible from this script).
ds = Dataset.from_list(records)


def read_image_bytes(example: dict) -> dict:
    """Attach the raw contents of the file at ``example["path"]`` to the record.

    Args:
        example: A record with a ``"path"`` key naming a readable file.

    Returns:
        The same ``example`` object (mutated in place), with an added
        ``"image_bytes"`` key holding the file's contents as ``bytes``.

    Raises:
        KeyError: If ``example`` has no ``"path"`` key.
        OSError: If the file cannot be opened or read.
    """
    # Binary mode: the bytes are stored verbatim, nothing is decoded here.
    with open(example["path"], "rb") as img_f:
        example["image_bytes"] = img_f.read()
    return example


# Read each row's file from disk and store its bytes in a new "image_bytes"
# column. No columns are removed, so every original field is kept.
ds = ds.map(read_image_bytes)

# Declare the schema explicitly so column types are fixed by this script
# rather than inferred from the data.
features = Features({
    "problem_id": Value("int64"),
    "problem": Value("string"),
    "data_type": Value("string"),
    "problem_type": Value("string"),
    "options": Sequence(Value("string")),
    "solution": Value("string"),
    "data_source": Value("string"),
    "answer": Value("string"),
    "path": Value("string"),
    "image_bytes": Value("binary"),
})
ds = ds.cast(features)

# Expose the raw bytes as an Image column named "images". With decode=True
# the datasets library returns a decoded image object when a row is accessed
# (per the `datasets.Image` documentation).
ds = ds.rename_column("image_bytes", "images")
ds = ds.cast_column("images", Image(decode=True))

# Sanity check: fetch the first row's image and print it.
img0 = ds[0]["images"]
print(img0)

# Make sure the output directory exists before writing, rather than relying
# on the writer to create it.
output_path = "./hf_data/Train_QA_10k_noFreeForm.parquet"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
ds.to_parquet(output_path)