retrained with new data from opensubs. qwerty subs
Browse files
- config.json +1 -1
- dataset_dict.json +1 -0
- eval/data-00000-of-00001.arrow +3 -0
- eval/dataset_info.json +65 -0
- eval/state.json +13 -0
- model.safetensors +1 -1
- tokenizer_config.json +0 -4
- train/data-00000-of-00002.arrow +3 -0
- train/data-00001-of-00002.arrow +3 -0
- train/dataset_info.json +65 -0
- train/state.json +16 -0
config.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"_name_or_path": "
|
| 3 |
"architectures": [
|
| 4 |
"T5ForConditionalGeneration"
|
| 5 |
],
|
|
|
|
| 1 |
{
|
| 2 |
+
"_name_or_path": "t5-small",
|
| 3 |
"architectures": [
|
| 4 |
"T5ForConditionalGeneration"
|
| 5 |
],
|
dataset_dict.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"splits": ["train", "eval"]}
|
eval/data-00000-of-00001.arrow
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e3914c2653952d5ef669d4dec5cfa59bd7587d0999b3038929d02ef51c3f3f7
|
| 3 |
+
size 187010360
|
eval/dataset_info.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"builder_name": "csv",
|
| 3 |
+
"citation": "",
|
| 4 |
+
"config_name": "default",
|
| 5 |
+
"dataset_name": "csv",
|
| 6 |
+
"dataset_size": 472235292,
|
| 7 |
+
"description": "",
|
| 8 |
+
"download_checksums": {
|
| 9 |
+
"/home/ubuntu/wwdrive2/14March/clean_train.csv": {
|
| 10 |
+
"num_bytes": 369392984,
|
| 11 |
+
"checksum": null
|
| 12 |
+
},
|
| 13 |
+
"/home/ubuntu/wwdrive2/14March/clean_eval.csv": {
|
| 14 |
+
"num_bytes": 71786407,
|
| 15 |
+
"checksum": null
|
| 16 |
+
}
|
| 17 |
+
},
|
| 18 |
+
"download_size": 441179391,
|
| 19 |
+
"features": {
|
| 20 |
+
"input_ids": {
|
| 21 |
+
"feature": {
|
| 22 |
+
"dtype": "int32",
|
| 23 |
+
"_type": "Value"
|
| 24 |
+
},
|
| 25 |
+
"_type": "Sequence"
|
| 26 |
+
},
|
| 27 |
+
"attention_mask": {
|
| 28 |
+
"feature": {
|
| 29 |
+
"dtype": "int8",
|
| 30 |
+
"_type": "Value"
|
| 31 |
+
},
|
| 32 |
+
"_type": "Sequence"
|
| 33 |
+
},
|
| 34 |
+
"labels": {
|
| 35 |
+
"feature": {
|
| 36 |
+
"dtype": "int64",
|
| 37 |
+
"_type": "Value"
|
| 38 |
+
},
|
| 39 |
+
"_type": "Sequence"
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"homepage": "",
|
| 43 |
+
"license": "",
|
| 44 |
+
"size_in_bytes": 913414683,
|
| 45 |
+
"splits": {
|
| 46 |
+
"train": {
|
| 47 |
+
"name": "train",
|
| 48 |
+
"num_bytes": 394316099,
|
| 49 |
+
"num_examples": 4507525,
|
| 50 |
+
"dataset_name": "csv"
|
| 51 |
+
},
|
| 52 |
+
"eval": {
|
| 53 |
+
"name": "eval",
|
| 54 |
+
"num_bytes": 77919193,
|
| 55 |
+
"num_examples": 1127410,
|
| 56 |
+
"dataset_name": "csv"
|
| 57 |
+
}
|
| 58 |
+
},
|
| 59 |
+
"version": {
|
| 60 |
+
"version_str": "0.0.0",
|
| 61 |
+
"major": 0,
|
| 62 |
+
"minor": 0,
|
| 63 |
+
"patch": 0
|
| 64 |
+
}
|
| 65 |
+
}
|
eval/state.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_data_files": [
|
| 3 |
+
{
|
| 4 |
+
"filename": "data-00000-of-00001.arrow"
|
| 5 |
+
}
|
| 6 |
+
],
|
| 7 |
+
"_fingerprint": "150697196cb85c2e",
|
| 8 |
+
"_format_columns": null,
|
| 9 |
+
"_format_kwargs": {},
|
| 10 |
+
"_format_type": null,
|
| 11 |
+
"_output_all_columns": false,
|
| 12 |
+
"_split": "eval"
|
| 13 |
+
}
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 241984552
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:35a20c7f6786661d16cdff57b5432726582b676eb3c02e5f7a869c58db2dd3de
|
| 3 |
size 241984552
|
tokenizer_config.json
CHANGED
|
@@ -930,12 +930,8 @@
|
|
| 930 |
"clean_up_tokenization_spaces": true,
|
| 931 |
"eos_token": "</s>",
|
| 932 |
"extra_ids": 100,
|
| 933 |
-
"max_length": 512,
|
| 934 |
"model_max_length": 512,
|
| 935 |
"pad_token": "<pad>",
|
| 936 |
-
"stride": 0,
|
| 937 |
"tokenizer_class": "T5Tokenizer",
|
| 938 |
-
"truncation_side": "right",
|
| 939 |
-
"truncation_strategy": "longest_first",
|
| 940 |
"unk_token": "<unk>"
|
| 941 |
}
|
|
|
|
| 930 |
"clean_up_tokenization_spaces": true,
|
| 931 |
"eos_token": "</s>",
|
| 932 |
"extra_ids": 100,
|
|
|
|
| 933 |
"model_max_length": 512,
|
| 934 |
"pad_token": "<pad>",
|
|
|
|
| 935 |
"tokenizer_class": "T5Tokenizer",
|
|
|
|
|
|
|
| 936 |
"unk_token": "<unk>"
|
| 937 |
}
|
train/data-00000-of-00002.arrow
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee121506abd0676a5266c0592070a202642789ef9a38fd5876aa403135c44a04
|
| 3 |
+
size 446399184
|
train/data-00001-of-00002.arrow
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3cc62e5c0f5a5137d80e124fb9156e0a60274f953ed6b95347cf7c32df324d25
|
| 3 |
+
size 425352424
|
train/dataset_info.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"builder_name": "csv",
|
| 3 |
+
"citation": "",
|
| 4 |
+
"config_name": "default",
|
| 5 |
+
"dataset_name": "csv",
|
| 6 |
+
"dataset_size": 472235292,
|
| 7 |
+
"description": "",
|
| 8 |
+
"download_checksums": {
|
| 9 |
+
"/home/ubuntu/wwdrive2/14March/clean_train.csv": {
|
| 10 |
+
"num_bytes": 369392984,
|
| 11 |
+
"checksum": null
|
| 12 |
+
},
|
| 13 |
+
"/home/ubuntu/wwdrive2/14March/clean_eval.csv": {
|
| 14 |
+
"num_bytes": 71786407,
|
| 15 |
+
"checksum": null
|
| 16 |
+
}
|
| 17 |
+
},
|
| 18 |
+
"download_size": 441179391,
|
| 19 |
+
"features": {
|
| 20 |
+
"input_ids": {
|
| 21 |
+
"feature": {
|
| 22 |
+
"dtype": "int32",
|
| 23 |
+
"_type": "Value"
|
| 24 |
+
},
|
| 25 |
+
"_type": "Sequence"
|
| 26 |
+
},
|
| 27 |
+
"attention_mask": {
|
| 28 |
+
"feature": {
|
| 29 |
+
"dtype": "int8",
|
| 30 |
+
"_type": "Value"
|
| 31 |
+
},
|
| 32 |
+
"_type": "Sequence"
|
| 33 |
+
},
|
| 34 |
+
"labels": {
|
| 35 |
+
"feature": {
|
| 36 |
+
"dtype": "int64",
|
| 37 |
+
"_type": "Value"
|
| 38 |
+
},
|
| 39 |
+
"_type": "Sequence"
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"homepage": "",
|
| 43 |
+
"license": "",
|
| 44 |
+
"size_in_bytes": 913414683,
|
| 45 |
+
"splits": {
|
| 46 |
+
"train": {
|
| 47 |
+
"name": "train",
|
| 48 |
+
"num_bytes": 394316099,
|
| 49 |
+
"num_examples": 4507525,
|
| 50 |
+
"dataset_name": "csv"
|
| 51 |
+
},
|
| 52 |
+
"eval": {
|
| 53 |
+
"name": "eval",
|
| 54 |
+
"num_bytes": 77919193,
|
| 55 |
+
"num_examples": 1127410,
|
| 56 |
+
"dataset_name": "csv"
|
| 57 |
+
}
|
| 58 |
+
},
|
| 59 |
+
"version": {
|
| 60 |
+
"version_str": "0.0.0",
|
| 61 |
+
"major": 0,
|
| 62 |
+
"minor": 0,
|
| 63 |
+
"patch": 0
|
| 64 |
+
}
|
| 65 |
+
}
|
train/state.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_data_files": [
|
| 3 |
+
{
|
| 4 |
+
"filename": "data-00000-of-00002.arrow"
|
| 5 |
+
},
|
| 6 |
+
{
|
| 7 |
+
"filename": "data-00001-of-00002.arrow"
|
| 8 |
+
}
|
| 9 |
+
],
|
| 10 |
+
"_fingerprint": "26c810267bb075b4",
|
| 11 |
+
"_format_columns": null,
|
| 12 |
+
"_format_kwargs": {},
|
| 13 |
+
"_format_type": null,
|
| 14 |
+
"_output_all_columns": false,
|
| 15 |
+
"_split": "train"
|
| 16 |
+
}
|