JamesANZ committed · Commit d9ee2fd · 1 Parent(s): 0506d30

Improve generation quality without retraining: add candidate reranking, few-shot prompts, aggressive filtering, and quality scoring

Files changed (1): query_slm.py (+279, -31)
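
Note: the updated generation path in this commit can also be exercised directly from Python. The snippet below is a minimal sketch of calling the new generate_answer() signature; the LegalSLM constructor arguments are not part of this diff, so the constructor call shown here is a hypothetical placeholder.

    # Minimal sketch, assuming query_slm.py is importable; LegalSLM's constructor
    # arguments (e.g. a model path) are not shown in this diff and are hypothetical here.
    from query_slm import LegalSLM

    slm = LegalSLM()  # hypothetical constructor call
    answer = slm.generate_answer(
        "What is negligence in Australian law?",
        temperature=0.2,      # new default from this commit
        max_length=200,       # maximum new tokens
        num_candidates=3,     # candidates generated and reranked
        use_reranking=True,   # pick the highest-scoring candidate
    )
    print(answer)
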
query_slm.py CHANGED
@@ -72,20 +72,24 @@ class LegalSLM:
     def generate_answer(
         self,
         question: str,
-        temperature: float = 0.4,
-        max_length: int = 250,
-        top_p: float = 0.9,
-        top_k: int = 50
+        temperature: float = 0.2,
+        max_length: int = 200,
+        num_candidates: int = 3,
+        top_p: float = 0.85,
+        top_k: int = 30,
+        use_reranking: bool = True
     ) -> str:
         """
-        Generate an answer to a legal question.
+        Generate an answer to a legal question with quality improvements.
 
         Args:
             question: The legal question to answer
             temperature: Sampling temperature (lower = more deterministic)
-            max_length: Maximum length of generated response
+            max_length: Maximum length of generated response in tokens
+            num_candidates: Number of candidates to generate for reranking
             top_p: Nucleus sampling parameter
             top_k: Top-k sampling parameter
+            use_reranking: If True, generate multiple candidates and pick best
 
         Returns:
             Generated answer text
@@ -101,53 +105,277 @@ class LegalSLM:
             raise ValueError("Temperature must be between 0.0 and 2.0")
         if max_length < 1 or max_length > 1000:
             raise ValueError("max_length must be between 1 and 1000")
+        if num_candidates < 1 or num_candidates > 10:
+            raise ValueError("num_candidates must be between 1 and 10")
 
-        # Build prompt
-        prompt = f"Based on Australian legal documents, answer the following.\n\nQuestion: {sanitized_question}\nAnswer:"
-
-        # Tokenize prompt with attention mask to fix the warning
-        # Using tokenizer() instead of encode() to get attention_mask automatically
+        # Build prompt with few-shot examples for better quality
+        prompt = self._build_few_shot_prompt(sanitized_question)
+
+        # Tokenize prompt with attention mask
         tokenized = self.tokenizer(
             prompt,
             return_tensors='pt',
-            padding=False,  # No padding needed for single input
+            padding=False,
             truncation=False
         )
         input_ids = tokenized['input_ids'].to(self.device)
         attention_mask = tokenized['attention_mask'].to(self.device)
 
-        # Generate
+        # Generate candidates
+        num_to_generate = num_candidates if use_reranking else 1
+
         with torch.no_grad():
             outputs = self.model.generate(
                 input_ids,
-                attention_mask=attention_mask,  # Pass attention mask to fix warning
-                max_length=input_ids.shape[1] + max_length,
+                attention_mask=attention_mask,
+                max_new_tokens=max_length,
+                num_return_sequences=num_to_generate,
+                min_new_tokens=30,  # Force minimum answer length
                 temperature=temperature,
                 top_p=top_p,
                 top_k=top_k,
                 do_sample=True,
                 pad_token_id=self.tokenizer.pad_token_id,
                 eos_token_id=self.tokenizer.eos_token_id,
-                repetition_penalty=1.2,  # Reduce repetition
+                repetition_penalty=1.4,  # Higher to reduce repetition
+                no_repeat_ngram_size=4,  # Prevent 4-gram repetition
+                early_stopping=False,
             )
 
-        # Decode response
-        full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        # Extract just the answer part (after "Answer:")
-        if "Answer:" in full_response:
-            answer = full_response.split("Answer:")[-1].strip()
-        else:
-            # Fallback: return everything after the prompt
-            # Safety check: ensure prompt is not longer than response
-            if len(prompt) <= len(full_response):
-                answer = full_response[len(prompt):].strip()
-            else:
-                # If prompt is longer (shouldn't happen, but handle gracefully)
-                answer = full_response.strip()
-
-        return answer
+        # Extract and process candidates
+        candidates = []
+        for output in outputs:
+            full_response = self.tokenizer.decode(output, skip_special_tokens=True)
+            answer = self._extract_answer(full_response, prompt)
+            answer = self._clean_and_filter_answer(answer)
+
+            # Only consider answers that pass quality checks
+            if answer and len(answer.strip()) > 30:
+                if use_reranking:
+                    score = self._score_answer_quality(answer)
+                    candidates.append((score, answer))
+                else:
+                    # Return first valid answer if not reranking
+                    return self._find_natural_stopping_point(answer, max_chars=600)
+
+        # If no valid candidates, return fallback
+        if not candidates:
+            return "I cannot provide a reliable answer to this question based on the available information."
+
+        # Return best candidate based on quality score
+        candidates.sort(key=lambda x: x[0], reverse=True)
+        best_answer = candidates[0][1]
+
+        # Final cleanup and length limit
+        best_answer = self._find_natural_stopping_point(best_answer, max_chars=600)
+        return best_answer
+
+    def _build_few_shot_prompt(self, question: str) -> str:
+        """
+        Build prompt with few-shot examples to guide the model.
+
+        Args:
+            question: The user's question
+
+        Returns:
+            Formatted prompt with examples
+        """
+        # Few-shot examples that demonstrate good answer format
+        examples = [
+            ("What is negligence in Australian law?",
+             "Negligence is a legal concept in Australian law that requires a duty of care, breach of that duty, and resulting damage."),
+            ("What is a contract?",
+             "A contract is a legally binding agreement between parties that creates mutual obligations enforceable by law."),
+        ]
+
+        prompt_parts = []
+        for q, a in examples:
+            prompt_parts.append(f"Question: {q}\nAnswer: {a}")
+
+        # Add the actual question
+        prompt_parts.append(f"Question: {question}\nAnswer:")
+
+        return "\n\n".join(prompt_parts)
+
+    def _extract_answer(self, full_response: str, prompt: str) -> str:
+        """
+        Extract answer from full response.
+
+        Args:
+            full_response: Complete model response
+            prompt: Original prompt
+
+        Returns:
+            Extracted answer text
+        """
+        # Try multiple extraction methods
+        if "Answer:" in full_response:
+            # Split by "Answer:" and take the last part (in case of multiple)
+            parts = full_response.split("Answer:")
+            if len(parts) > 1:
+                answer = parts[-1].strip()
+            else:
+                answer = full_response.strip()
+        else:
+            # Fallback: remove prompt from response
+            answer = full_response.replace(prompt, "").strip()
+
+        return answer
+
+    def _clean_and_filter_answer(self, answer: str) -> str:
+        """
+        Aggressively clean and filter gibberish from answer.
+
+        Args:
+            answer: Raw answer text
+
+        Returns:
+            Cleaned answer or empty string if too poor quality
+        """
+        if not answer:
+            return ""
+
+        # Remove problematic prefixes that don't add value
+        problematic_starts = [
+            ("Yes.", 4),
+            ("Yes,", 4),
+            ("Yes ", 4),
+            ("I do not know", 13),
+            ("I am not sure", 13),
+            ("I do and will not", 17),
+        ]
+
+        for prefix, length in problematic_starts:
+            if answer.strip().startswith(prefix):
+                after = answer[length:].strip()
+                # Only remove if there's substantial content after
+                if len(after) > 30:
+                    answer = after
+                break
+
+        # Remove everything after rambling markers
+        rambling_markers = ['---', '???', '...', '?---', '\n\nQuestion:', '\nQuestion:']
+        for marker in rambling_markers:
+            if marker in answer:
+                idx = answer.find(marker)
+                answer = answer[:idx].strip()
+
+        # Remove excessive whitespace
+        answer = ' '.join(answer.split())
+
+        # Remove incomplete sentences at the end
+        # Keep only complete sentences (ending with . ! or ?)
+        sentences = re.split(r'([.!?]\s+)', answer)
+        if len(sentences) > 1:
+            cleaned = []
+            for i in range(0, len(sentences) - 1, 2):
+                if i + 1 < len(sentences):
+                    cleaned.append(sentences[i] + sentences[i + 1])
+            if cleaned:
+                answer = ''.join(cleaned).strip()
+
+        # Filter out if too short
+        if len(answer) < 30:
+            return ""
+
+        # Check for excessive repetition (gibberish indicator)
+        words = answer.split()
+        if len(words) > 0:
+            unique_ratio = len(set(words)) / len(words)
+            if unique_ratio < 0.3:  # More than 70% repetition
+                return ""
+
+        return answer
+
+    def _score_answer_quality(self, answer: str) -> float:
+        """
+        Score answer quality (higher is better).
+
+        Args:
+            answer: Answer text to score
+
+        Returns:
+            Quality score
+        """
+        if not answer or len(answer) < 20:
+            return -100
+
+        score = 0
+
+        # Reward reasonable length (sweet spot around 200-400 chars)
+        length = len(answer)
+        if 100 <= length <= 500:
+            score += 50
+        elif 50 <= length < 100:
+            score += 30
+        elif length > 500:
+            score += 40  # Slightly less for very long
+        else:
+            score -= 20
+
+        # Penalize common gibberish patterns
+        gibberish_patterns = ['---', '???', '...', '?---', 'I do not know', 'I am not sure']
+        for pattern in gibberish_patterns:
+            if pattern in answer:
+                score -= 30
+
+        # Penalize if starts with "Yes." and nothing substantial
+        if answer.strip().startswith("Yes.") and len(answer.strip()) < 50:
+            score -= 40
+
+        # Reward complete sentences
+        sentence_count = answer.count('. ') + answer.count('? ') + answer.count('! ')
+        score += min(sentence_count * 5, 30)
+
+        # Reward diversity (less repetition)
+        words = answer.split()
+        if len(words) > 0:
+            unique_ratio = len(set(words)) / len(words)
+            score += unique_ratio * 30
+
+        # Penalize excessive question marks (uncertainty)
+        if answer.count('?') > 3:
+            score -= 20
+
+        # Reward legal terminology (common legal words)
+        legal_terms = ['law', 'legal', 'court', 'act', 'section', 'australia', 'australian',
+                       'right', 'obligation', 'contract', 'negligence', 'liability', 'duty']
+        term_count = sum(1 for term in legal_terms if term.lower() in answer.lower())
+        score += min(term_count * 3, 15)
+
+        return score
+
+    def _find_natural_stopping_point(self, text: str, max_chars: int = 600) -> str:
+        """
+        Find a natural stopping point in text to prevent abrupt cuts.
+
+        Args:
+            text: Text to truncate
+            max_chars: Maximum character length
+
+        Returns:
+            Text truncated at natural boundary
+        """
+        if len(text) <= max_chars:
+            return text
+
+        # Try to cut at sentence boundary
+        truncated = text[:max_chars]
+        sentence_endings = ['. ', '.\n', '? ', '!\n', '! ']
+
+        for ending in sentence_endings:
+            idx = truncated.rfind(ending)
+            # If found in last 30% of truncated text, use it
+            if idx > max_chars * 0.7:
+                return truncated[:idx + 1].strip()
+
+        # Fallback: cut at word boundary
+        words = truncated.rsplit(' ', 1)
+        if len(words) > 1:
+            return words[0] + '...'
+
+        return truncated + '...'
+
     def interactive_query(self):
         """Run interactive query loop."""
         print("\n" + "=" * 80)
@@ -169,7 +397,14 @@ class LegalSLM:
 
             print("\nGenerating answer...")
             try:
-                answer = self.generate_answer(question)
+                # Use reranking by default for better quality
+                answer = self.generate_answer(
+                    question,
+                    temperature=0.2,
+                    max_length=200,
+                    num_candidates=3,
+                    use_reranking=True
+                )
                 print(f"\nAnswer: {answer}\n")
             except ValueError as e:
                 print(f"\nInvalid input: {e}\n")
@@ -207,14 +442,25 @@ def main():
    parser.add_argument(
        '--temperature',
        type=float,
-       default=0.4,
-       help='Sampling temperature (default: 0.4)'
+       default=0.2,
+       help='Sampling temperature (default: 0.2, lower = more deterministic)'
    )
    parser.add_argument(
        '--max-length',
        type=int,
-       default=250,
-       help='Maximum response length in tokens (default: 250)'
+       default=200,
+       help='Maximum response length in tokens (default: 200)'
+   )
+   parser.add_argument(
+       '--num-candidates',
+       type=int,
+       default=3,
+       help='Number of candidates to generate for reranking (default: 3)'
+   )
+   parser.add_argument(
+       '--no-reranking',
+       action='store_true',
+       help='Disable candidate reranking (faster but lower quality)'
    )
 
    args = parser.parse_args()
@@ -250,7 +496,9 @@ def main():
        answer = slm.generate_answer(
            args.question,
            temperature=args.temperature,
-           max_length=args.max_length
+           max_length=args.max_length,
+           num_candidates=args.num_candidates,
+           use_reranking=not args.no_reranking
        )
        print(f"\nQuestion: {args.question}")
        print(f"Answer: {answer}\n")