jlzhou and RandomTao committed on
Commit 74c04ce • 1 Parent(s): e291b64

[Doc] Add the use of the tables in the QuickStart (#2)


- [Doc] Add the use of the tables in the QuickStart (c7c5c53e2f19d645b8029b029daeafe131e9b8b1)


Co-authored-by: taozhang <RandomTao@users.noreply.huggingface.co>

Files changed (1)
  1. README.md +45 -15
README.md CHANGED
@@ -52,9 +52,9 @@ For now, the standalone decoder is open-sourced and fully functional without hav
 
 This model is static, trained on an offline dataset. Future versions may be released to enhance its performance on specialized tasks.
 
-**Quickstart**
+**QuickStart**
 
-Here provides a code snippet with apply_chat_template to show you how to load the tokenizer and model and how to generate contents.
+This code snippet demonstrates how to build a prompt with table information, and shows how to load the tokenizer, load the model, and generate content.
 
 > Note that you need `transformers>=4.37.0` to use `TableGPT2`:
 > ```sh
@@ -64,33 +64,62 @@ Here provides a code snippet with apply_chat_template to show you how to load th
 ```python
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+# Using pandas to read some structured data
+import pandas as pd
+from io import StringIO
+
+# single table
+EXAMPLE_CSV_CONTENT = """
+"Loss","Date","Score","Opponent","Record","Attendance"
+"Hampton (14–12)","September 25","8–7","Padres","67–84","31,193"
+"Speier (5–3)","September 26","3–1","Padres","67–85","30,711"
+"Elarton (4–9)","September 22","3–1","@ Expos","65–83","9,707"
+"Lundquist (0–1)","September 24","15–11","Padres","67–83","30,774"
+"Hampton (13–11)","September 6","9–5","Dodgers","61–78","31,407"
+"""
+
+csv_file = StringIO(EXAMPLE_CSV_CONTENT)
+df = pd.read_csv(csv_file)
+# Some data preprocessing
+# code
+
 model_name = "tablegpt/TableGPT2-7B"
 
 model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype="auto",
-    device_map="auto"
+    model_name, torch_dtype="auto", device_map="auto"
 )
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-prompt = "Hey, who are you?"
+example_prompt_template = """Given access to several pandas dataframes, write the Python code to answer the user's question.
+
+/*
+"{var_name}.head(5).to_string(index=False)" as follows:
+{df_info}
+*/
+
+Question: {user_question}
+"""
+question = "哪些比赛的战绩达到了40胜40负?"  # "Which games reached a record of 40 wins and 40 losses?"
+
+prompt = example_prompt_template.format(
+    var_name="df",
+    df_info=df.head(5).to_string(index=False),
+    user_question=question,
+)
+
 messages = [
     {"role": "system", "content": "You are a helpful assistant."},
-    {"role": "user", "content": prompt}
+    {"role": "user", "content": prompt},
 ]
 text = tokenizer.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
+    messages, tokenize=False, add_generation_prompt=True
 )
 model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
 
-generated_ids = model.generate(
-    **model_inputs,
-    max_new_tokens=512
-)
+generated_ids = model.generate(**model_inputs, max_new_tokens=512)
 generated_ids = [
-    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+    output_ids[len(input_ids) :]
+    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
 ]
 
 response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
@@ -121,6 +150,7 @@ For deployment, we recommend using vLLM.
 }'
 
 ```
+For more details about how to use TableGPT2, please refer to [our repository on GitHub](https://github.com/tablegpt/tablegpt-agent)
 
 **License**
 
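The added snippet ends at decoding `response`; with a table-aware prompt like the one built above, TableGPT2 is expected to reply with Python code that operates on `df`. Below is a minimal sketch, not part of this commit, of one way to pull that code out of the reply and run it, assuming the model wraps its answer in a fenced Python block; the `extract_code` helper and the stub reply are hypothetical.

```python
import re
from io import StringIO

import pandas as pd

# Reuse two rows of the example table from the snippet above; enough for a demo.
df = pd.read_csv(StringIO(
    '"Loss","Date","Score","Opponent","Record","Attendance"\n'
    '"Hampton (14–12)","September 25","8–7","Padres","67–84","31,193"\n'
    '"Speier (5–3)","September 26","3–1","Padres","67–85","30,711"\n'
))

def extract_code(reply: str) -> str:
    """Pull the first fenced Python block out of a model reply (hypothetical helper)."""
    match = re.search(r"```python\n(.*?)```", reply, re.DOTALL)
    return match.group(1) if match else reply

# Stand-in for the `response` string produced at the end of the QuickStart snippet.
response = '```python\nprint(df[df["Opponent"] == "Padres"])\n```'

code = extract_code(response)
exec(code, {"df": df, "pd": pd})  # model-generated code is untrusted; sandbox it in real use
```

For anything beyond a demo, run model-generated code in an isolated interpreter rather than the host process; the tablegpt-agent repository linked in the diff provides the fuller workflow.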
 
 
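The last hunk keeps only the tail of the README's curl example for serving with vLLM. For reference, here is a minimal sketch, not part of this commit, of querying such a server from Python, assuming it was started with `python -m vllm.entrypoints.openai.api_server --model tablegpt/TableGPT2-7B` (default port 8000, OpenAI-compatible routes) and that the `openai` client package is installed.

```python
# A sketch only: call a locally served TableGPT2 through vLLM's
# OpenAI-compatible API using the official `openai` client.
from openai import OpenAI

# vLLM does not check the API key unless one is configured; any placeholder works.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="tablegpt/TableGPT2-7B",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        # In practice this would be the table-aware prompt built in the QuickStart snippet.
        {"role": "user", "content": "Hey, who are you?"},
    ],
)
print(completion.choices[0].message.content)
```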