import json
import random
import traceback

from llmsearch import utilityV2 as ut


def findnth(haystack, needle, n):
    # index of the (n+1)-th occurrence of needle in haystack (n is 0-based),
    # or -1 if there are fewer than n+1 occurrences
    parts = haystack.split(needle, n + 1)
    if len(parts) <= n + 1:
        return -1
    return len(haystack) - len(parts[-1]) - len(needle)

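# Illustrative calls (hypothetical values): findnth("a/b/c/d", "/", 2)
# returns 5, the index of the third "/"; findnth("a/b", "/", 2) returns -1
# because there is no third occurrence; n=0 finds the first occurrence.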

def extract_site(url):
    # Best-effort extraction of the site name from a URL, e.g.
    # "https://www.example.com/page" -> "example".
    site = ""
    base = findnth(url, "/", 2)  # index of the third "/", i.e. end of the host
    if base > 2:
        labels = url[:base].split(".")
        # the second-to-last label is usually the site name; fall back to
        # the whole host when it contains no dot (e.g. "https://localhost")
        site = labels[-2] if len(labels) > 1 else labels[0]
    site = site.replace("https://", "")
    site = site.replace("http://", "")
    return site

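# Illustrative calls (hypothetical URLs):
# extract_site("https://www.example.com/page") returns "example";
# extract_site("https://localhost/page") keeps the whole host, and the
# scheme is then stripped, yielding "localhost".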

site_stats = {}  # dictionary of per-site usage statistics, keyed by site name
stats_loaded = False  # True once site_stats.json has been read
stats_dirty = False  # True when site_stats has unsaved changes


def open_site_stats():
    global site_stats, stats_loaded
    if stats_loaded:
        return
    try:
        with open("site_stats.json", "r") as f:
            site_stats = json.load(f)
    except (OSError, json.JSONDecodeError):
        # missing or unreadable stats file: keep the empty in-memory dict
        print("Failed to read site_stats.json")
        traceback.print_exc()
    stats_loaded = True  # attempt the read only once per run


def ckpt():
    # checkpoint: persist site_stats to disk if there are unsaved changes
    global site_stats, stats_dirty
    if not stats_dirty:
        return
    try:
        with open("site_stats.json", "w") as f:
            json.dump(site_stats, f)
        stats_dirty = False
    except Exception as e:
        print(f"Failed to write site_stats: {str(e)}")
        traceback.print_exc()


def update_site_stats(site, char_cnt, get_time, extract_time, openai_time):
    global stats_dirty
    open_site_stats()
    entry = retrieve(site)  # creates a zeroed entry if the site is new
    entry["hits"] = entry.get("hits", 0) + 1  # tolerate entries from older files
    entry["chars"] += char_cnt
    entry["get"] += get_time
    entry["extract"] += extract_time
    entry["openai"] += openai_time
    stats_dirty = True
    # print("updated", entry)

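# After a single call like update_site_stats("example", 1200, 0.4, 0.1, 2.0)
# (hypothetical values, times in whatever units the caller uses), the entry
# site_stats["example"] would be:
#   {"name": "example", "hits": 1, "chars": 1200,
#    "get": 0.4, "extract": 0.1, "openai": 2.0}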

def retrieve(site):
    # return the stats entry for site, creating a zeroed one if absent
    return site_stats.setdefault(
        site,
        {"name": site, "hits": 0, "chars": 0, "get": 0, "extract": 0, "openai": 0},
    )

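# Note that retrieve("newsite") both returns the freshly created zeroed entry
# and registers it in site_stats, so later updates through update_site_stats
# accumulate into the same dict object.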

def get_next(urls, sample_unknown=False):
    open_site_stats()  # make sure any persisted stats are loaded
    # gather (stats, url) candidates for the sites in the list
    candidates = []
    for url in urls:
        site = extract_site(url)
        candidate = retrieve(site)
        if sample_unknown or (site in ut.sites and ut.sites[site] != 0):
            candidates.append((candidate, url))
    if len(candidates) == 0:
        return []
    if len(candidates) == 1:
        return candidates[0]
    # explore ~15% of the time: pick uniformly at random, no sort needed
    if random.random() > 0.85:
        pick = int(random.random() * len(candidates))
        return candidates[pick]

    # otherwise exploit: rank by observed throughput, i.e. characters
    # retrieved per unit of get + extract + openai time (the denominator is
    # floored at 1000 so sparsely sampled sites do not get huge scores)
    candidates.sort(
        reverse=True,
        key=lambda item: (
            (item[0]["chars"] * 1000000)
            / max(1000, item[0]["get"] + item[0]["extract"] + item[0]["openai"])
        ),
    )

    # cubing p skews the draw heavily toward the top of the ranking
    p = random.random()
    p2 = p * p * p
    return candidates[int(p2 * len(candidates))]
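

if __name__ == "__main__":
    # Minimal smoke test with hypothetical URLs; a sketch that assumes the
    # llmsearch.utilityV2 import above resolves. sample_unknown=True keeps
    # sites that do not appear in ut.sites, so no ut.sites setup is needed.
    # Note: ckpt() writes site_stats.json into the current directory.
    urls = [
        "https://www.example.com/a",
        "https://docs.python.org/3/",
    ]
    for url in urls:
        update_site_stats(extract_site(url), char_cnt=1200,
                          get_time=0.4, extract_time=0.1, openai_time=2.0)
    ckpt()  # persist the updated stats
    print(get_next(urls, sample_unknown=True))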