File size: 1,840 Bytes
1dfccc3
 
d812385
 
 
 
 
 
 
 
 
 
1dfccc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
628fe8f
d812385
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import torch
import numpy as np


# Length threshold, in characters, that is_user_query_valid() compares
# len(user_query) against.
MAX_USER_QUERY_LEN = 35

# Example queries for easy access, keyed by a display label.
# is_user_query_valid() tests membership in the *values* of this mapping
# separately from the length check; note that some of these strings are
# longer than MAX_USER_QUERY_LEN.
DEFAULT_QUERIES = {
    "Example Query 1": "Who visited microsoft.com on September 18?",
    "Example Query 2": "Does Kate has drive ?",
    "Example Query 3": "What phone number can be used to contact David Johnson?",
}

def get_batch_text_representation(texts, model, tokenizer, batch_size=1):
    """
    Get mean-pooled representations of given texts in batches.

    Each batch is tokenized with padding and truncation, run through `model`
    without gradient tracking, and the last hidden states are averaged over
    the positions whose attention mask is set, so padding does not contribute.

    Args:
        texts: Sliceable sequence of input strings.
        model: Callable invoked as `model(**inputs, output_hidden_states=False)`
            whose result exposes `last_hidden_state`; assumed to be shaped
            (batch, seq_len, hidden) -- TODO confirm against the model in use.
        tokenizer: Callable invoked as
            `tokenizer(batch, return_tensors="pt", padding=True, truncation=True)`
            whose result is a mapping that includes `'attention_mask'`.
        batch_size (int): Number of texts processed per forward pass.
    Returns:
        numpy.ndarray: One pooled vector per input text, in input order.
            An empty `texts` yields an empty array. A text whose attention
            mask is entirely zero yields a zero vector rather than NaN.
    """
    pooled_rows = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            hidden = model(**encoded, output_hidden_states=False).last_hidden_state

        # Broadcast the mask over the hidden dimension so masked positions
        # are multiplied by zero before summing.
        mask = encoded['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
        summed = torch.sum(hidden * mask, 1)

        # Clamp the token count so a row with no attended tokens divides by a
        # tiny positive number (giving zeros) instead of 0/0 (giving NaN).
        token_counts = torch.clamp(mask.sum(1), min=1e-9)

        pooled_rows.extend((summed / token_counts).cpu().detach().numpy())
    return np.array(pooled_rows)


def is_user_query_valid(user_query: str) -> bool:
    """
    Flag a `user_query` that is missing or too long and is not an example query.

    NOTE(review): despite the function name, the returned value is True for
    queries that are missing or over the length limit and False for the
    rest -- confirm that callers expect this polarity before changing it.

    Args:
        user_query (str): The input text to be checked; may be None.
    Returns:
        bool: True if `user_query` is not one of the `DEFAULT_QUERIES` values
            and is either None or longer than `MAX_USER_QUERY_LEN` characters;
            False otherwise (including for the empty string).
    """
    # Whether the text matches one of the predefined example queries
    matches_example = user_query in DEFAULT_QUERIES.values()

    # Whether the text is absent or longer than the configured limit
    missing_or_too_long = user_query is None or len(user_query) > MAX_USER_QUERY_LEN

    return missing_or_too_long and not matches_example