DataSoul committed on
Commit
cbf7dfa
1 Parent(s): bda6d05

Upload parquet to.txt-ALMA-make imatrix.py

Browse files
Files changed (1) hide show
  1. parquet to.txt-ALMA-make imatrix.py +23 -0
parquet to.txt-ALMA-make imatrix.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import pandas as pd

# Prerequisite: download the haoranxu/ALMA-R-Preference dataset first
# (https://huggingface.co/datasets/haoranxu/ALMA-R-Preference).

# Step 1: load the parquet file and print its columns as a quick sanity check.
df = pd.read_parquet('haoranxu-ALMA-R-Preference.parquet')
print(df.columns)

# Alternative column selections left over from the original script (unused):
# text_column = df[['alma_en', 'alma_zh', 'en', 'gpt4_en', 'gpt4_zh', 'zh']]
# text_column = df[['en', 'zh']]

# Step 2: dump every English/Chinese pair to a UTF-8 text file.
with open('haoranxu-ALMA-R-Preference-en-zh--zh-en.txt', 'w', encoding='utf-8') as out_file:
    for pair in df['translation']:
        english = pair.get('en')
        chinese = pair.get('zh')

        # Skip entries where either side is missing or empty.
        if not english or not chinese:
            continue

        # Write the pair in both directions (en->zh, then zh->en), each
        # followed by a blank line.
        out_file.writelines((
            f"English: {english}\nChinese: {chinese}\n\n",
            f"Chinese: {chinese}\nEnglish: {english}\n\n",
        ))

# Step 3: the generated text file can then be used to build your
# language-specific imatrix.dat.