# parquet to.txt-ALMA-make imatrix.py
# Added in the commit "Upload parquet to.txt-ALMA-make imatrix.py" (new file, 23 lines).
import pandas as pd

# Convert the ALMA-R-Preference parquet file into a plain-text corpus of
# English/Chinese pairs.
#
# Before running, download the dataset from haoranxu/ALMA-R-Preference
# (https://huggingface.co/datasets/haoranxu/ALMA-R-Preference).
#
# The generated text file can then be used to build your language imatrix.dat.

# Input parquet file (expected in the current working directory).
PARQUET_PATH = 'haoranxu-ALMA-R-Preference.parquet'
# Output text file; it holds every pair in both en->zh and zh->en order.
OUTPUT_PATH = 'haoranxu-ALMA-R-Preference-en-zh--zh-en.txt'


def format_pair(en_text, zh_text):
    """Return the text written for one English/Chinese pair.

    The pair is emitted twice, once English-first and once Chinese-first,
    with each record terminated by a blank line.
    """
    return (
        f"English: {en_text}\nChinese: {zh_text}\n\n"
        f"Chinese: {zh_text}\nEnglish: {en_text}\n\n"
    )


def write_pairs(items, out):
    """Write every complete en/zh pair found in *items* to *out*.

    Args:
        items: Iterable of mapping-like records that expose ``.get`` and may
            hold the keys ``'en'`` and ``'zh'``.
        out: Writable text stream (anything with a ``write`` method).

    Returns:
        The number of pairs written.

    Records without a ``.get`` method (for example ``None`` or NaN for a
    missing row) are skipped instead of raising ``AttributeError``. Records
    where either text is missing or empty are skipped as well.
    """
    written = 0
    for item in items:
        if getattr(item, 'get', None) is None:
            # Missing row: nothing to look up.
            continue
        en_text = item.get('en')
        zh_text = item.get('zh')
        if en_text and zh_text:  # both 'en' and 'zh' must be present
            out.write(format_pair(en_text, zh_text))
            written += 1
    return written


def main():
    """Read the parquet file and save the bidirectional text corpus."""
    # Parquet to txt
    df = pd.read_parquet(PARQUET_PATH)
    print(df.columns)
    # Leftover alternatives from the original script, kept for reference:
    # text_column = df[['alma_en', 'alma_zh', 'en', 'gpt4_en', 'gpt4_zh', 'zh']]
    # text_column = df[['en', 'zh']]

    # save txt
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        write_pairs(df['translation'], f)


if __name__ == "__main__":
    main()