shs131566 committed
Commit e4dd662 • 1 Parent(s): e489bb6

Upload major_area_meeting_transcripts.py with huggingface_hub

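For context, a commit with this message is typically produced by huggingface_hub's HfApi.upload_file. A minimal sketch of such an upload call (the repo id below is hypothetical, not taken from this page):

from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="major_area_meeting_transcripts.py",  # local script to push
    path_in_repo="major_area_meeting_transcripts.py",     # destination path in the repo
    repo_id="shs131566/major_area_meeting_transcripts",   # hypothetical repo id
    repo_type="dataset",
    commit_message="Upload major_area_meeting_transcripts.py with huggingface_hub",
)
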
Files changed (1)
  1. major_area_meeting_transcripts.py +201 -0
major_area_meeting_transcripts.py ADDED
@@ -0,0 +1,201 @@
import datasets
import os
import json

# URL tables live in sibling modules next to this script.
from train_url import _TRAIN_DATA_URLS
from validation_url import _VALIDATION_DATA_URLS


_HOMEPAGE = "https://www.aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=data&dataSetSn=464"
_DESCRIPTION = "AI HUB 주요 영역별 회의 음성 데이터셋"  # AI Hub major-domain meeting speech dataset


class MajorAreaMeetingConfig(datasets.BuilderConfig):
    def __init__(self, name, **kwargs):
        super(MajorAreaMeetingConfig, self).__init__(
            version=datasets.Version("1.0.0", ""),
            name=name,
            **kwargs,
        )
        if name == "all":
            # The "all" config concatenates the archive URL lists of every subset.
            self.data_urls = {
                "train": _TRAIN_DATA_URLS["공중파방송"]
                + _TRAIN_DATA_URLS["기타녹음"]
                + _TRAIN_DATA_URLS["라디오"]
                + _TRAIN_DATA_URLS["인터넷방송"],
                "validation": _VALIDATION_DATA_URLS["공중파방송"]
                + _VALIDATION_DATA_URLS["기타녹음"]
                + _VALIDATION_DATA_URLS["라디오"]
                + _VALIDATION_DATA_URLS["인터넷방송"],
            }
        else:
            # Single-subset configs need data_urls too: _split_generators reads
            # self.config.data_urls, which the original only set for "all".
            self.data_urls = {
                "train": _TRAIN_DATA_URLS[name],
                "validation": _VALIDATION_DATA_URLS[name],
            }
        # Each audio archive (data.tar.gz) ships with a transcript file
        # (data.jsonl) at the same URL prefix.
        self.transcript_urls = {
            split: [url.replace("data.tar.gz", "data.jsonl") for url in urls]
            for split, urls in self.data_urls.items()
        }


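# NOTE (added for clarity): _TRAIN_DATA_URLS and _VALIDATION_DATA_URLS are
# assumed to map each subset name to a list of archive URLs ending in
# "data.tar.gz", e.g. {"라디오": ["https://.../radio/0001/data.tar.gz", ...]};
# the .replace() above derives the matching transcript URLs from them.

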
class MajorAreaMeeting(datasets.GeneratorBasedBuilder):

    BUILDER_CONFIGS = [
        MajorAreaMeetingConfig(name="all"),
        MajorAreaMeetingConfig(name="공중파방송"),  # terrestrial broadcast
        MajorAreaMeetingConfig(name="기타녹음"),  # other recordings
        MajorAreaMeetingConfig(name="라디오"),  # radio
        MajorAreaMeetingConfig(name="인터넷방송"),  # internet broadcast
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            homepage=_HOMEPAGE,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "path": datasets.Value("string"),
                    "audio": datasets.features.Audio(sampling_rate=16_000),
                    "speaker_id": datasets.Value("string"),
                    "speaker_name": datasets.Value("string"),
                    "speaker_age": datasets.Value("string"),
                    "speaker_occupation": datasets.Value("string"),
                    "speaker_role": datasets.Value("string"),
                    "form": datasets.Value("string"),
                    "original_form": datasets.Value("string"),
                    "environment": datasets.Value("string"),
                    "isIdiom": datasets.Value("bool"),
                    "hangeulToEnglish": datasets.Sequence(
                        {
                            "id": datasets.Value("string"),
                            "hangeul": datasets.Value("string"),
                            "english": datasets.Value("string"),
                            "begin": datasets.Value("int32"),
                            "end": datasets.Value("int32"),
                        }
                    ),
                    "hangeulToNumber": datasets.Sequence(
                        {
                            "id": datasets.Value("string"),
                            "hangeul": datasets.Value("string"),
                            "number": datasets.Value("string"),
                            "begin": datasets.Value("int32"),
                            "end": datasets.Value("int32"),
                        }
                    ),
                    "term": datasets.Sequence(
                        {
                            "id": datasets.Value("string"),
                            "hangeul": datasets.Value("string"),
                            "begin": datasets.Value("int32"),
                            "end": datasets.Value("int32"),
                        }
                    ),
                }
            ),
        )

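    # NOTE (added for clarity): datasets.Sequence over a dict of sub-features is
    # encoded as a dict of lists rather than a list of dicts, so a decoded
    # example exposes e.g. example["hangeulToEnglish"]["begin"] as a list of ints.
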
    def _split_generators(self, dl_manager):
        # Transcripts (.jsonl) are downloaded as plain files; audio archives
        # (.tar.gz) are only extracted to disk in non-streaming mode and are
        # otherwise iterated directly from the tarballs.
        transcripts = dl_manager.download(
            {
                "train": self.config.transcript_urls["train"],
                "validation": self.config.transcript_urls["validation"],
            }
        )
        audio_archives = dl_manager.download(
            {
                "train": self.config.data_urls["train"],
                "validation": self.config.data_urls["validation"],
            }
        )

        local_extracted_archives = (
            dl_manager.extract(audio_archives) if not dl_manager.is_streaming else {}
        )

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "transcript_paths": transcripts["train"],
                    "audio_archives": [
                        dl_manager.iter_archive(archive)
                        for archive in audio_archives["train"]
                    ],
                    "local_extracted_archive": local_extracted_archives.get("train"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "transcript_paths": transcripts["validation"],
                    "audio_archives": [
                        dl_manager.iter_archive(archive)
                        for archive in audio_archives["validation"]
                    ],
                    "local_extracted_archive": local_extracted_archives.get(
                        "validation"
                    ),
                },
            ),
        ]

    def _generate_examples(
        self,
        transcript_paths,
        audio_archives,
        local_extracted_archive,
    ):
        # Index every transcript line by utterance id so each audio file can be
        # joined against it. The original read only transcript_paths[0], which
        # drops all but the first archive's transcripts.
        transcripts = {}
        for transcript_path in transcript_paths:
            with open(transcript_path, "r", encoding="utf-8") as file:
                for line in file:
                    data = json.loads(line)
                    transcripts[data["id"]] = data

        for archive_idx, audio_archive in enumerate(audio_archives):
            for audio_filename, file in audio_archive:
                local_audio_file_path = (
                    os.path.join(local_extracted_archive[archive_idx], audio_filename)
                    if local_extracted_archive
                    else None
                )
                # The utterance id is the audio filename without its extension.
                data = transcripts[os.path.basename(audio_filename)[:-4]]

                yield audio_filename, {
                    "id": data["id"],
                    # "path" is declared in _info() and must be populated here.
                    "path": (
                        local_audio_file_path
                        if local_audio_file_path
                        else audio_filename
                    ),
                    "speaker_id": data["speaker_id"],
                    "speaker_name": data["speaker_name"],
                    "speaker_age": data["speaker_age"],
                    "speaker_occupation": data["speaker_occupation"],
                    "speaker_role": data["speaker_role"],
                    "form": data["form"],
                    "original_form": data["original_form"],
                    "environment": data["environment"],
                    "isIdiom": data["isIdiom"],
                    # The source JSON uses "hangul..." keys; the features use "hangeul...".
                    "hangeulToEnglish": data["hangulToEnglish"],
                    "hangeulToNumber": data["hangulToNumber"],
                    "term": data["term"],
                    "audio": {
                        "path": (
                            local_audio_file_path
                            if local_audio_file_path
                            else audio_filename
                        ),
                        "bytes": file.read(),
                    },
                }
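
A minimal sketch of loading this builder, assuming the script and its train_url/validation_url modules sit together in a local directory and a datasets version that still supports script-based builders (the directory name below is hypothetical):

import datasets

# "all" combines every subset; the other config names are the Korean
# subset names defined in BUILDER_CONFIGS above.
ds = datasets.load_dataset(
    "./major_area_meeting_transcripts",  # hypothetical local dir containing the script
    "라디오",
    trust_remote_code=True,
)
print(ds["train"][0]["form"])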