Spaces:
Runtime error
Runtime error
File size: 5,704 Bytes
096a82e 9889763 68be317 096a82e d40a755 68be317 cf494b2 096a82e cf494b2 096a82e 845df70 68be317 3ed7c41 096a82e 3ed7c41 871af30 cf494b2 cba50c7 adbdb15 3ed7c41 cba50c7 68b08cf 871af30 68b08cf cba50c7 096a82e a8fb57f 096a82e a8fb57f b036a52 a8fb57f 096a82e 68be317 68b08cf 68be317 d40a755 68b08cf cf494b2 871af30 cba50c7 9889763 cba50c7 adbdb15 9889763 adbdb15 cba50c7 68b08cf 096a82e a8fb57f 096a82e a8fb57f 096a82e 68b08cf 2e3d9b7 |
|
import pyterrier as pt
pt.init()
import numpy as np
import pandas as pd
import gradio as gr
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer
from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md, EX_D
# Hugging Face model ids offered (read-only) in the demo dropdowns.
MODEL = 'macavaney/doc2query-t5-base-msmarco'
SCORE_MODEL = 'crystina-z/monoELECTRA_LCE_nneg31'

# Ascending score thresholds in steps of five percentiles: entry i is the
# (i + 1) * 5-th percentile (5th .. 95th), i.e. 19 cut points / 20 buckets.
PERCENTILES_BY_5 = np.array([
    -3.80468750e+00, -2.21679688e+00, -1.25683594e+00, -5.58105469e-01,
    -7.65323639e-04, 4.69482422e-01, 8.83300781e-01, 1.25878906e+00,
    1.61035156e+00, 1.94335938e+00, 2.26562500e+00, 2.58007812e+00,
    2.89648438e+00, 3.21484375e+00, 3.54687500e+00, 3.90039062e+00,
    4.30078125e+00, 4.77343750e+00, 5.37109375e+00,
])

# One CSS colour per bucket (len(PERCENTILES_BY_5) + 1 entries), running from
# red (lowest scores) through white to green (highest scores).
# Channel values must stay within 0-255; the two near-white entries previously
# used 256, which is outside the valid rgb() range.
COLORS = [
    'rgb(252, 132, 100)',
    'rgb(252, 148, 116)',
    'rgb(252, 166, 137)',
    'rgb(252, 183, 156)',
    'rgb(253, 200, 178)',
    'rgb(254, 215, 198)',
    'rgb(255, 228, 216)',
    'rgb(255, 237, 228)',
    'rgb(255, 245, 240)',
    'rgb(255, 255, 255)',
    'rgb(247, 252, 245)',
    'rgb(240, 250, 237)',
    'rgb(233, 247, 228)',
    'rgb(222, 242, 216)',
    'rgb(209, 237, 203)',
    'rgb(195, 232, 188)',
    'rgb(180, 225, 173)',
    'rgb(163, 218, 157)',
    'rgb(145, 210, 142)',
    'rgb(125, 201, 126)',
]
# Shared, module-level model objects reused by every request.
# The append / num_samples values given here are also the initial values of
# the UI controls; predict() and predict_mm() overwrite them per request.
doc2query = Doc2Query(MODEL, append=True, num_samples=5)
# Build the scorer from SCORE_MODEL explicitly so the model that actually runs
# is the one the UI displays, predict_mm() asserts, and the generated snippet
# (`ElectraScorer(SCORE_MODEL)`) advertises.
electra = ElectraScorer(SCORE_MODEL)
query_scorer = QueryScorer(electra)
# t=0 is only a placeholder: predict_mm() sets the threshold before each use.
query_filter = QueryFilter(t=0, append=False)
# Notebook file name passed to code2md() together with the install commands.
COLAB_NAME = 'pyterrier_doc2query.ipynb'

# Install commands for the plain doc2query example (one shell line each).
COLAB_INSTALL = '\n'.join((
    '!pip install -q git+https://github.com/terrier-org/pyterrier',
    '!pip install -q git+https://github.com/terrierteam/pyterrier_doc2query',
))

# The scoring/filtering example additionally needs pyterrier_dr and faiss.
COLAB_INSTALL_MM = '\n'.join((
    COLAB_INSTALL,
    '!pip install -q git+https://github.com/terrierteam/pyterrier_dr faiss-cpu',
))
def predict(input, model, append, num_samples):
    """Run doc2query on the demo input and build the three demo outputs.

    Args:
        input: Frame of documents handed over by the demo; it is passed
            unchanged to ``doc2query`` and to ``df2code``.
        model: Model id chosen in the UI; must equal ``MODEL``.
        append: Value assigned to ``doc2query.append`` for this request.
        num_samples: Value assigned to ``doc2query.num_samples`` for this
            request.

    Returns:
        A tuple ``(result_frame, markdown, html)``: the doc2query output, the
        example code rendered through ``code2md`` and the HTML produced by
        ``generate_vis`` for that same output.
    """
    assert model == MODEL
    # The generator is a shared module-level object; reconfigure it in place.
    doc2query.append = append
    doc2query.num_samples = num_samples
    code = f'''import pandas as pd
from pyterrier_doc2query import Doc2Query
doc2query = Doc2Query({repr(model)}, append={append}, num_samples={num_samples})
doc2query({df2code(input)})
'''
    # Run the model exactly once and reuse the result for both the table and
    # the visualisation. Calling it a second time doubled the work and, if
    # generation is sampled, could return different queries than the ones
    # shown in the visualisation.
    res = doc2query(input)
    vis = generate_vis(res)
    return (res, code2md(code, COLAB_INSTALL, COLAB_NAME), vis)
def generate_vis(df):
    """Render doc2query output as an HTML fragment, one panel per document.

    Every row of ``df`` must provide ``docno`` and ``text``. If a row also has
    ``querygen`` (queries separated by newlines) the queries are listed below
    the document text. If it additionally has ``querygen_score``, each query
    gets a badge showing the score bucket looked up in ``PERCENTILES_BY_5``
    and coloured with the matching ``COLORS`` entry.

    Document ids, document text and queries are HTML-escaped before being
    embedded, so markup contained in them is shown literally rather than
    being interpreted by the browser.

    Args:
        df: Frame with the columns described above.

    Returns:
        One HTML string; the per-document panels are joined by newlines
        (an empty string for an empty frame).
    """
    import html  # function-scope import keeps this block self-contained

    panels = []
    for row in df.itertuples(index=False):
        query_divs = []
        if hasattr(row, 'querygen_score'):
            # zip() stops at the shorter sequence, so only (query, score)
            # pairs are rendered.
            for q, score in zip(row.querygen.split('\n'), row.querygen_score):
                # Number of thresholds below the score: 0 .. len(PERCENTILES_BY_5).
                bucket = np.searchsorted(PERCENTILES_BY_5, score)
                color = COLORS[bucket]
                percentile = bucket * 5
                badge_style = (
                    'border: 1px solid #888; border-radius: 3px; font-size: 0.6em; '
                    f'font-family: monospace; background-color: {color}; padding: 1px 3px;'
                )
                query_divs.append(
                    '\n<div>\n'
                    f'<span title="score={score:.4f}, in the {percentile}th percentile of scores" '
                    f'style="{badge_style}">{percentile}th</span> {html.escape(q)}\n'
                    '</div>\n'
                )
        elif hasattr(row, 'querygen'):
            query_divs.extend(
                f'\n<div>{html.escape(q)}</div>\n'
                for q in row.querygen.split('\n')
            )

        queries_html = '\n'.join(query_divs)
        if queries_html:
            # Only show the heading when there is at least one query to list.
            queries_html = (
                '\n<div><strong>Expansion Queries:</strong></div>\n'
                f'{queries_html}\n'
            )

        # Escape first, then turn line breaks into <br/> so the tags we add
        # ourselves are not escaped.
        text = html.escape(row.text).replace('\n', '<br/>')
        docno = html.escape(str(row.docno))
        panels.append(
            '\n'
            f'<div style="font-size: 1.2em;">Document: <strong>{docno}</strong></div>\n'
            '<div style="margin: 4px 0 16px; padding: 4px; border: 1px solid black;">\n'
            '<div>\n'
            f'{text}\n'
            '</div>\n'
            f'{queries_html}\n'
            '</div>\n'
        )
    return '\n'.join(panels)
def predict_mm(input, model, num_samples, score_model, filter_pct):
    """Generate queries, score them, optionally filter them, and build the demo outputs.

    Args:
        input: Frame of documents handed over by the demo; passed unchanged
            to the pipeline and to ``df2code``.
        model: Generator model id chosen in the UI; must equal ``MODEL``.
        num_samples: Value assigned to ``doc2query.num_samples``.
        score_model: Scorer model id chosen in the UI; must equal
            ``SCORE_MODEL``.
        filter_pct: Filter percentile from the UI. ``0`` (or less) skips the
            filter stage; otherwise the threshold is
            ``PERCENTILES_BY_5[filter_pct//5-1]``.

    Returns:
        A tuple ``(result_frame, markdown, html)``. In ``result_frame`` the
        ``querygen_score`` column is replaced by a ``'[ a, b, ... ]'`` string
        per row; the HTML is generated before that replacement.
    """
    assert model == MODEL
    assert score_model == SCORE_MODEL

    def _scores_to_text(scores):
        # Flatten one row's scores into a single printable string.
        return '[ ' + ', '.join(str(v) for v in scores) + ' ]'

    # The generator is shared at module level; this pipeline always keeps the
    # queries in their own column rather than appending them to the text.
    doc2query.append = False
    doc2query.num_samples = num_samples

    filtering = filter_pct > 0
    if not filtering:
        pipeline = doc2query >> query_scorer
        code = f'''import pyterrier as pt ; pt.init()
import pandas as pd
from pyterrier_doc2query import Doc2Query, QueryScorer
from pyterrier_dr import ElectraScorer
doc2query = Doc2Query({repr(model)}, append=False, num_samples={num_samples})
scorer = ElectraScorer({repr(score_model)})
pipeline = doc2query >> QueryScorer(scorer)
pipeline({df2code(input)})
'''
    else:
        # Slider values 5, 10, ..., 95 map onto threshold indices 0 .. 18.
        query_filter.t = PERCENTILES_BY_5[filter_pct//5-1]
        pipeline = doc2query >> query_scorer >> query_filter
        code = f'''import pyterrier as pt ; pt.init()
import pandas as pd
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer
doc2query = Doc2Query({repr(model)}, append=False, num_samples={num_samples})
scorer = ElectraScorer({repr(score_model)})
pipeline = doc2query >> QueryScorer(scorer) >> QueryFilter(append=False, t={query_filter.t})
# use append=True when indexing; t={query_filter.t} is the {filter_pct}th percentile for generated queries on MS MARCO
pipeline({df2code(input)})
'''

    scored = pipeline(input)
    # Build the visualisation while the scores are still numeric.
    html_vis = generate_vis(scored)
    scored['querygen_score'] = scored['querygen_score'].apply(_scores_to_text)
    markdown = code2md(code, COLAB_INSTALL_MM, COLAB_NAME)
    return (scored, markdown, html_vis)
# Page sections are created top to bottom, in the order they are displayed.
_intro_md = MarkdownFile('README.md')

# Demo 1: plain doc2query generation.
_doc2query_demo = Demo(
    predict,
    EX_D,
    [
        gr.Dropdown(
            choices=[MODEL],
            value=MODEL,
            label='Model',
            interactive=False,
        ),
        gr.Checkbox(
            value=doc2query.append,
            label="Append",
        ),
        gr.Slider(
            minimum=1,
            maximum=10,
            value=doc2query.num_samples,
            step=1.,
            label='# Queries'
        ),
    ],
)

_mm_md = MarkdownFile('mm.md')

# Demo 2: generation followed by scoring and optional filtering.
_mm_demo = Demo(
    predict_mm,
    EX_D,
    [
        gr.Dropdown(
            choices=[MODEL],
            value=MODEL,
            label='Model',
            interactive=False,
        ),
        gr.Slider(
            minimum=1,
            maximum=10,
            value=doc2query.num_samples,
            step=1.,
            label='# Queries'
        ),
        gr.Dropdown(
            choices=[SCORE_MODEL],
            value=SCORE_MODEL,
            label='Scorer',
            interactive=False,
        ),
        gr.Slider(
            minimum=0,
            maximum=95,
            value=10,
            step=5,
            label='Filter (top % of queries)'
        ),
    ],
)

_wrapup_md = MarkdownFile('wrapup.md')

_app = interface(
    _intro_md,
    _doc2query_demo,
    _mm_md,
    _mm_demo,
    _wrapup_md,
)
_app.launch(share=False)
|