StevenChen16 committed on
Commit c39e972
1 Parent(s): 6c2ef5e

update chat_llama3_8b function

Files changed (1)
  1. app.py +90 -56
app.py CHANGED
@@ -165,67 +165,101 @@ def query_vector_store(vector_store: FAISS, query, k=4, relevance_threshold=0.8)
 
 @spaces.GPU(duration=120)
 def chat_llama3_8b(message: str,
-                   history: list,
-                   temperature=0.6,
-                   max_new_tokens=4096
-                   ) -> str:
+                   history: list,
+                   temperature=0.6,
+                   max_new_tokens=4096
+                   ) -> str:
     """
-    Generate a streaming response using the llama3-8b model.
-    Will display citations after the response if citations are available.
-    """
-    # Get citations from vector store
-    citation = query_vector_store(vector_store, message, 4, 0.7)
-
-    # Build conversation history
-    conversation = []
-    for user, assistant in history:
-        conversation.extend([
-            {"role": "user", "content": user},
-            {"role": "assistant", "content": assistant}
-        ])
-
-    # Construct the final message with background prompt and citations
-    if citation:
-        message = f"{background_prompt}Based on these citations: {citation}\nPlease answer question: {message}"
-    else:
-        message = f"{background_prompt}{message}"
-
-    conversation.append({"role": "user", "content": message})
-
-    # Generate response
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
-    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-
-    generate_kwargs = dict(
-        input_ids=input_ids,
-        streamer=streamer,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        temperature=temperature,
-        eos_token_id=terminators,
-    )
-
-    if temperature == 0:
-        generate_kwargs['do_sample'] = False
-
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-
-    outputs = []
-    for text in streamer:
-        outputs.append(text)
-        current_output = "".join(outputs)
-
-        # If we have citations, append them at the end
-        if citation and text == streamer[-1]:  # On the last chunk
-            citation_display = "\n\nReferences:\n" + "\n".join(
-                f"[{i+1}] {cite.strip()}"
-                for i, cite in enumerate(citation.split('\n'))
-                if cite.strip()
-            )
-            current_output += citation_display
-
-        yield current_output
+    Generate a streaming response using the LLaMA model.
+
+    Args:
+        message (str): The current user message
+        history (list): List of previous conversation turns
+        temperature (float): Sampling temperature (0.0 to 1.0)
+        max_new_tokens (int): Maximum number of tokens to generate
+
+    Returns:
+        str: Generated response with citations if available
+    """
+    try:
+        # 1. Get relevant citations from vector store
+        citation = query_vector_store(vector_store, message, k=4, relevance_threshold=0.7)
+
+        # 2. Format conversation history
+        conversation = []
+        for user, assistant in history:
+            conversation.extend([
+                {"role": "user", "content": str(user)},
+                {"role": "assistant", "content": str(assistant)}
+            ])
+
+        # 3. Construct the final prompt
+        final_message = ""
+        if citation:
+            final_message = f"{background_prompt}\nBased on these references:\n{citation}\nPlease answer: {message}"
+        else:
+            final_message = f"{background_prompt}\n{message}"
+
+        conversation.append({"role": "user", "content": final_message})
+
+        # 4. Prepare model inputs
+        input_ids = tokenizer.apply_chat_template(
+            conversation,
+            return_tensors="pt"
+        ).to(model.device)
+
+        # 5. Setup streamer
+        streamer = TextIteratorStreamer(
+            tokenizer,
+            timeout=10.0,
+            skip_prompt=True,
+            skip_special_tokens=True
+        )
+
+        # 6. Configure generation parameters
+        generation_config = {
+            "input_ids": input_ids,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": temperature > 0,
+            "temperature": temperature,
+            "eos_token_id": terminators
+        }
+
+        # 7. Generate in a separate thread
+        thread = Thread(target=model.generate, kwargs=generation_config)
+        thread.start()
+
+        # 8. Stream the output
+        accumulated_text = []
+        final_chunk = False
+
+        for text_chunk in streamer:
+            accumulated_text.append(text_chunk)
+            current_response = "".join(accumulated_text)
+
+            # Check if this is the last chunk
+            try:
+                next_chunk = next(iter(streamer))
+                accumulated_text.append(next_chunk)
+            except (StopIteration, RuntimeError):
+                final_chunk = True
+
+            # Add citations on the final chunk if they exist
+            if final_chunk and citation:
+                formatted_citations = "\n\nReferences:\n" + "\n".join(
+                    f"[{i+1}] {cite.strip()}"
+                    for i, cite in enumerate(citation.split('\n'))
+                    if cite.strip()
+                )
+                current_response += formatted_citations
+
+            yield current_response
+
+    except Exception as e:
+        error_message = f"An error occurred: {str(e)}"
+        print(error_message)  # For logging
+        yield error_message
 
 
 # Gradio block
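
The updated generator streams partial responses and, on the final chunk, appends a numbered "References" footer built from the newline-separated string returned by query_vector_store. As a standalone preview of that footer format (the sample passages here are made up for illustration):

    citation = "First relevant passage.\nSecond relevant passage."
    references = "\n\nReferences:\n" + "\n".join(
        f"[{i+1}] {cite.strip()}"
        for i, cite in enumerate(citation.split("\n"))
        if cite.strip()
    )
    print(references)
    # Output (after a leading blank line):
    # References:
    # [1] First relevant passage.
    # [2] Second relevant passage.

Because chat_llama3_8b yields incrementally growing strings, it can plug straight into the Gradio block referenced above. The sketch below is only an assumption of how that wiring might look (the actual Gradio block in app.py is not part of this diff, and the slider labels and defaults are hypothetical); gr.ChatInterface calls the function with the message, the history, and each additional input in order, which matches the signature above:

    import gradio as gr

    # Hypothetical wiring; the real Gradio block in app.py is not shown in this commit.
    demo = gr.ChatInterface(
        fn=chat_llama3_8b,
        additional_inputs=[
            gr.Slider(minimum=0.0, maximum=1.0, value=0.6, step=0.05, label="Temperature"),
            gr.Slider(minimum=128, maximum=4096, value=4096, step=64, label="Max new tokens"),
        ],
    )

    if __name__ == "__main__":
        demo.queue().launch()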