winglian committed on
Commit
2598c9f
1 Parent(s): decb66e

allow the sharegpt handler to also better handle datasets destined for openai finetuning (#1361)

Browse files

* allow the sharegpt handler to also better handle datasets destined for openai finetuning

* make sure to support system role

src/axolotl/prompt_strategies/sharegpt.py CHANGED
@@ -82,7 +82,7 @@ class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
82
  basic sharegpt strategy to grab conversations from the sample row
83
  """
84
 
85
- _strict = True
86
 
87
  @property
88
  def strict(self):
@@ -96,10 +96,25 @@ class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
96
  conversations = prompt["conversations"]
97
  if self.strict:
98
  return conversations
99
- # remap roles - allow for assistant turn
100
- role_map = {"human": "human", "assistant": "gpt", "gpt": "gpt"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  turns = [
102
- {"from": role_map[t["from"]], "value": t["value"]} for t in conversations
 
103
  ]
104
  return turns
105
 
 
82
  basic sharegpt strategy to grab conversations from the sample row
83
  """
84
 
85
+ _strict = False
86
 
87
  @property
88
  def strict(self):
 
96
  conversations = prompt["conversations"]
97
  if self.strict:
98
  return conversations
99
+ role_key = "from"
100
+ if "role" in conversations[0].keys():
101
+ role_key = "role"
102
+ value_key = "value"
103
+ if "text" in conversations[0].keys():
104
+ value_key = "text"
105
+ elif "content" in conversations[0].keys():
106
+ value_key = "content"
107
+ # remap roles - allow for assistant turn"
108
+ role_map = {
109
+ "user": "human",
110
+ "human": "human",
111
+ "assistant": "gpt",
112
+ "gpt": "gpt",
113
+ "system": "system",
114
+ }
115
  turns = [
116
+ {"from": role_map[t[role_key]], "value": t[value_key]}
117
+ for t in conversations
118
  ]
119
  return turns
120