SGEcon committed on
Commit
c6b0748
โ€ข
1 Parent(s): d2b1968

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +32 -27
README.md CHANGED
@@ -1,6 +1,14 @@
1
  ---
2
  library_name: transformers
3
- license: apache-2.0
 
 
 
 
 
 
 
 
4
  ---
5
 
6
 
@@ -37,42 +45,39 @@ If you wish to use the original data rather than our training data, please conta
37
  ## Conducting Conversation
38
 
39
  import re
 
40
def gen(x):
    """Generate four candidate answers for the question *x*.

    Builds a "### ์งˆ๋ฌธ: ... ### ๋‹ต๋ณ€:" prompt, samples four sequences from
    the model, and returns a list with the answer text extracted from each
    decoded sequence.
    """
    inputs = tokenizer(f"### ์งˆ๋ฌธ: {x}\n\n### ๋‹ต๋ณ€:", return_tensors='pt', return_token_type_ids=False)

    # Move the tensors onto the GPU when one is available.
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = {name: tensor.to(device=target_device) for name, tensor in inputs.items()}

    gened = model.generate(
        **inputs,
        max_new_tokens=256,
        early_stopping=True,
        num_return_sequences=4,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        temperature=0.9,
        top_p=0.8,
        top_k=50,
    )

    marker = "### ๋‹ต๋ณ€:"
    complete_answers = []
    for seq in gened:
        decoded = tokenizer.decode(seq, skip_special_tokens=True).strip()

        # Keep only the text after the first "### ๋‹ต๋ณ€:" marker.
        answer = decoded[decoded.find(marker) + len(marker):].strip()

        # If the model began a second answer, cut it off at the repeated
        # marker; otherwise keep the whole answer.
        repeat_at = answer.find(marker)
        if repeat_at != -1:
            answer = answer[:repeat_at].strip()

        complete_answers.append(answer)

    return complete_answers
 
76
 
77
 
78
 
@@ -133,4 +138,4 @@ If you wish to use the original data rather than our training data, please conta
133
 
134
  ## Citation [optional]
135
 
136
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
 
1
  ---
2
  library_name: transformers
3
+ license: cc-by-nc-4.0
4
+ datasets:
5
+ - kyujinpy/KOR-OpenOrca-Platypus-v3
6
+ language:
7
+ - ko
8
+ - en
9
+ tags:
10
+ - Economic
11
+ - Finance
12
  ---
13
 
14
 
 
45
  ## Conducting Conversation
46
 
47
  import re
48
+
49
def gen(x):
    """Generate a single answer for the question *x*.

    Builds a "### ์งˆ๋ฌธ: ... ### ๋‹ต๋ณ€:" prompt, samples one sequence from the
    model, extracts the text following the "### ๋‹ต๋ณ€:" marker, and trims any
    trailing incomplete sentence after the last sentence-ending punctuation.
    """
    inputs = tokenizer(f"### ์งˆ๋ฌธ: {x}\n\n### ๋‹ต๋ณ€:", return_tensors='pt', return_token_type_ids=False)

    # Move the inputs to the GPU when one is available.
    inputs = {k: v.to(device="cuda" if torch.cuda.is_available() else "cpu") for k, v in inputs.items()}

    gened = model.generate(
        **inputs,
        max_new_tokens=256,                   # maximum number of newly generated tokens
        early_stopping=True,
        num_return_sequences=1,               # generate a single answer only
        do_sample=True,                       # enable sampling for varied answers
        eos_token_id=tokenizer.eos_token_id,  # stop at the EOS token
        temperature=0.9,                      # sampling temperature (diversity)
        top_p=0.8,                            # nucleus-sampling p value
        top_k=50                              # top-k sampling k value
    )

    # Decode the generated sequence into output text.
    decoded = tokenizer.decode(gened[0], skip_special_tokens=True).strip()

    # Extract only the text after the "### ๋‹ต๋ณ€:" marker.
    answer_start_idx = decoded.find("### ๋‹ต๋ณ€:") + len("### ๋‹ต๋ณ€:")
    complete_answer = decoded[answer_start_idx:].strip()

    # Trim a trailing incomplete sentence: keep everything up to and
    # including the LAST sentence-ending punctuation mark (., ?, !).
    # BUG FIX: this end-anchored pattern makes match.end() always equal
    # len(complete_answer), so the original slice [:match.end()] never
    # trimmed anything; [:match.start() + 1] keeps the punctuation mark
    # and drops the dangling fragment after it.
    match = re.search(r"[\.\?\!][^\.\?\!]*$", complete_answer)
    if match:
        complete_answer = complete_answer[:match.start() + 1].strip()

    return complete_answer
80
+
81
 
82
 
83
 
 
138
 
139
  ## Citation [optional]
140
 
141
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->