Spaces:

vectara
/

media-demo

Running

App Files Files Community

media-demo / query.py

ofermend

Upload 4 files

2d02ed4 verified 7 months ago

raw

history blame contribute delete

5.26 kB

	import requests
	import json

	class VectaraQuery():
	    """Small client for the Vectara v1 query endpoint used by the media demo.

	    Two request flavours are built by :meth:`get_body`:

	    * a reranked search (``summarize=False``) used by :meth:`submit_query`
	      to locate the best-matching subtitle snippet, and
	    * a summarization request (``summarize=True``) used by
	      :meth:`get_summary`, restricted to one document through a filter.

	    Every failure path prints a diagnostic and yields ``FAILURE_MESSAGE``.
	    """

	    QUERY_ENDPOINT = "https://api.vectara.io/v1/query"
	    FAILURE_MESSAGE = "Sorry, something went wrong in my brain. Please try again later."

	    def __init__(self, api_key: str, customer_id: str, corpus_ids: list[str], timeout: float = 70.0):
	        """Store credentials, target corpora and the summarization prompt.

	        Args:
	            api_key: Value sent in the ``x-api-key`` header.
	            customer_id: Value sent in the ``customer-id`` header and in
	                every corpus key.
	            corpus_ids: Corpora searched by each query.
	            timeout: Client-side timeout in seconds for each HTTP request.
	                The default sits slightly above the ``60S`` ``grpc-timeout``
	                header sent with each request.
	        """
	        self.customer_id = customer_id
	        self.corpus_ids = corpus_ids
	        self.api_key = api_key
	        self.timeout = timeout
	        # Markers requested around the matching sentence in each snippet.
	        self.START_TAG = "<em_start>"
	        self.END_TAG = "<em_end>"
	        self.prompt_name = "vectara-summary-ext-24-05-med"
	        # Prompt template sent verbatim as ``promptText``; do not reformat.
	        self.prompt_text = '''
	[{"role": "system", "content": "Follow these detailed step-by-step instructions, your task is to generate an accurate and coherent summary of the first search result.
	- You will receive a single search result enclosed in triple quotes, which includes part of a script from a movie.
	- the search result can be a part of a larger movie scence, and may be incomplete.
	- the text is a sequence of subtitles from the movie itself.
	- Base your summary only on the information provided in the search result, do not use any other sources.
	- Do no include the word summary in your response, just the summary itself.
	- Summarize the scene including who the characters are, what they do and any other important detail."},
	{"role": "user", "content": "#foreach ($qResult in $vectaraQueryResults) Search Result $esc.java($foreach.index + 1): \'\'\'$esc.java($qResult.text())\'\'\'.#end"}
	]
	'''

	    def get_body(self, query_str: str, filter: str = None, summarize: bool = True):
	        """Build the JSON-serializable request body for one query.

	        Args:
	            query_str: The query text.
	            filter: Optional filter expression attached to every corpus key.
	                The name shadows the builtin but is kept because callers
	                pass it by keyword.
	            summarize: When true, request 15 sentences of context on each
	                side and a summary of the top result; otherwise request one
	                sentence of context on each side and enable reranking.

	        Returns:
	            A dict with a single entry under ``'query'``.
	        """
	        corpora_key_list = []
	        for corpus_id in self.corpus_ids:
	            corpus_key = {
	                'customerId': self.customer_id,
	                'corpusId': corpus_id,
	                'lexicalInterpolationConfig': {'lambda': 0.005},
	            }
	            if filter:
	                corpus_key['filter'] = filter
	            corpora_key_list.append(corpus_key)

	        context_sentences = 15 if summarize else 1
	        query = {
	            'query': query_str,
	            'start': 0,
	            'numResults': 50,
	            'corpusKey': corpora_key_list,
	            'contextConfig': {
	                'sentences_before': context_sentences,
	                'sentences_after': context_sentences,
	                'start_tag': self.START_TAG,
	                'end_tag': self.END_TAG,
	            },
	        }
	        if summarize:
	            query['summary'] = [
	                {
	                    'responseLang': 'eng',
	                    'maxSummarizedResults': 1,
	                    'summarizerPromptName': self.prompt_name,
	                    'promptText': self.prompt_text,
	                }
	            ]
	        else:
	            # rerank only in main query, not when summarizing
	            query['rerankingConfig'] = {'rerankerId': 272725719}

	        return {'query': [query]}

	    def get_headers(self):
	        """Return the HTTP headers sent with every query request."""
	        return {
	            "Content-Type": "application/json",
	            "Accept": "application/json",
	            "customer-id": self.customer_id,
	            "x-api-key": self.api_key,
	            "grpc-timeout": "60S",
	        }

	    def _post_query(self, body: dict):
	        """POST ``body`` to the query endpoint and return the decoded JSON.

	        Returns ``None`` after printing a diagnostic when the request times
	        out or the server answers with a non-200 status. Other ``requests``
	        exceptions propagate to the caller.
	        """
	        try:
	            response = requests.post(
	                self.QUERY_ENDPOINT,
	                data=json.dumps(body),
	                verify=True,
	                headers=self.get_headers(),
	                # Without a timeout a stalled connection would block forever.
	                timeout=self.timeout,
	            )
	        except requests.Timeout as err:
	            print(f"Query timed out after {self.timeout} seconds: {err}")
	            return None
	        if response.status_code != 200:
	            print(f"Query failed with code {response.status_code}, reason {response.reason}, text {response.text}")
	            return None
	        return response.json()

	    def submit_query(self, query_str: str):
	        """Run the reranked search and describe its best hit.

	        Returns:
	            On success, the tuple ``(movie_title, snippet_url, score, doc_id,
	            matching_text)`` for the top result, where ``movie_title`` and
	            ``snippet_url`` are ``None`` if the corresponding metadata is
	            absent. On any failure (timeout, non-200 status, no results) the
	            ``FAILURE_MESSAGE`` string is returned instead, so callers must
	            check the return type before unpacking.
	        """
	        body = self.get_body(query_str, filter=None, summarize=False)
	        res = self._post_query(body)
	        if res is None:
	            return self.FAILURE_MESSAGE

	        response_sets = res.get('responseSet') or []
	        if not response_sets or not response_sets[0].get('response'):
	            print("Query succeeded but returned no results")
	            return self.FAILURE_MESSAGE

	        top_hit = response_sets[0]['response'][0]
	        document = response_sets[0]['document'][top_hit["documentIndex"]]

	        # Snippet-level metadata first, then the document id, then the
	        # document-level metadata under a "doc_" prefix (later keys win).
	        md = {m["name"]: m["value"] for m in top_hit["metadata"]}
	        md['doc_id'] = document["id"]
	        md.update({f'doc_{m["name"]}': m["value"] for m in document["metadata"]})

	        # Keep only the text between the highlight tags; when the start tag
	        # is missing, fall back to the whole snippet rather than raising.
	        pieces = top_hit["text"].split(self.START_TAG)
	        highlighted = pieces[1] if len(pieces) > 1 else pieces[0]
	        matching_text = highlighted.split(self.END_TAG)[0].strip()

	        return md.get("doc_title", None), md.get("url", None), top_hit["score"], md["doc_id"], matching_text

	    def get_summary(self, query_str: str, doc_id: str):
	        """Summarize the top result for ``query_str`` within one document.

	        Returns:
	            The summary text, or ``FAILURE_MESSAGE`` when the request fails
	            or the response carries no summary.
	        """
	        # NOTE(review): doc_id is interpolated verbatim, so an id containing
	        # a single quote yields a malformed filter expression. Confirm the
	        # filter language's escaping rule before changing this.
	        doc_filter = f"doc.id == '{doc_id}'"
	        body = self.get_body(query_str, doc_filter, summarize=True)

	        res = self._post_query(body)
	        if res is None:
	            return self.FAILURE_MESSAGE

	        try:
	            return res['responseSet'][0]['summary'][0]['text']
	        except (KeyError, IndexError):
	            print("Query succeeded but returned no summary")
	            return self.FAILURE_MESSAGE