File size: 10,891 Bytes
cf7c183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
"""
This is the template for implementing the rankers for your search engine.
You will be implementing WordCountCosineSimilarity, DirichletLM, TF-IDF, BM25, Pivoted Normalization, and your own ranker.
"""
from collections import Counter, defaultdict
from indexing import InvertedIndex
import math


class Ranker:
    """
    The ranker class is responsible for generating a list of documents for a given query, ordered by their scores
    using a particular relevance function (e.g., BM25).
    A Ranker can be configured with any RelevanceScorer.
    """
    # TODO: Implement this class properly; this is responsible for returning a list of sorted relevant documents
    def __init__(self, index: InvertedIndex, document_preprocessor, stopwords: set[str], scorer: 'RelevanceScorer') -> None:
        """
        Initializes the state of the Ranker object.

        TODO (HW3): Previous homeworks had you passing the class of the scorer to this function
        This has been changed as it created a lot of confusion.
        You should now pass an instantiated RelevanceScorer to this function.

        Args:
            index: An inverted index
            document_preprocessor: The DocumentPreprocessor to use for turning strings into tokens
            stopwords: The set of stopwords to use or None if no stopword filtering is to be done
            scorer: The RelevanceScorer object
        """
        self.index = index
        self.tokenize = document_preprocessor.tokenize
        self.scorer = scorer
        self.stopwords = stopwords

    def query(self, query: str) -> list[tuple[int, float]]:
        """
        Searches the collection for relevant documents to the query and
        returns a list of documents ordered by their relevance (most relevant first).

        Args:
            query: The query to search for

        Returns:
            A list of dictionary objects with keys "docid" and "score" where docid is
            a particular document in the collection and score is that document's relevance

        TODO (HW3): We are standardizing the query output of Ranker to match with L2RRanker.query and VectorRanker.query
        The query function should return a sorted list of tuples where each tuple has the first element as the document ID
        and the second element as the score of the document after the ranking process.
        """
        # TODO: Tokenize the query and remove stopwords, if needed
        tokens = self.tokenize(query)
        query_parts = [token for token in tokens if token not in self.stopwords] if self.stopwords else tokens
     
        # TODO: Fetch a list of possible documents from the index and create a mapping from
        #       a document ID to a dictionary of the counts of the query terms in that document.
        #       You will pass the dictionary to the RelevanceScorer as input.
        doc_word_counts = defaultdict(Counter)
        query_word_counts = Counter(query_parts)
        for term in query_word_counts:
            postings = self.index.get_postings(term)
            for posting in postings:
                doc_word_counts[posting[0]][term] = posting[1]

        # TODO: Rank the documents using a RelevanceScorer (like BM25 from below classes) 
        results = []
        for docid in doc_word_counts:
            res = self.scorer.score(docid, doc_word_counts[docid], query_word_counts)
            if res:
                results.append((docid, res))

        # TODO: Return the **sorted** results as format [{docid: 100, score:0.5}, {{docid: 10, score:0.2}}]
        results.sort(key=lambda x: x[1], reverse=True)
        return results


class RelevanceScorer:
    """
    This is the base interface for all the relevance scoring algorithm.
    It will take a document and attempt to assign a score to it.
    """
    # TODO: Implement the functions in the child classes (WordCountCosineSimilarity, DirichletLM, BM25,
    #  PivotedNormalization, TF_IDF) and not in this one

    def __init__(self, index: InvertedIndex, parameters) -> None:
        raise NotImplementedError

    def score(self, docid: int, doc_word_counts: dict[str, int], query_word_counts: dict[str, int]) -> float:
        """
        Returns a score for how relevance is the document for the provided query.

        Args:
            docid: The ID of the document
            doc_word_counts: A dictionary containing all words in the document and their frequencies.
                Words that have been filtered will be None.
            query_word_counts: A dictionary containing all words in the query and their frequencies.
                Words that have been filtered will be None.

        Returns:
            A score for how relevant the document is (Higher scores are more relevant.)

        """
        raise NotImplementedError


# TODO (HW1): Implement unnormalized cosine similarity on word count vectors
class WordCountCosineSimilarity(RelevanceScorer):
    def __init__(self, index: InvertedIndex, parameters={}) -> None:
        self.index = index
        self.parameters = parameters

    def score(self, docid: int, doc_word_counts: dict[str, int], query_word_counts: dict[str, int]) -> float:
        # 1. Find the dot product of the word count vector of the document and the word count vector of the query

        # 2. Return the score
        cwq = query_word_counts
        score = 0
        flag = 0
        for word in cwq:
            if word in doc_word_counts:
                flag = 1
                score += cwq[word] * doc_word_counts[word]
        if not flag:
            return None
        return score 


# TODO (HW1): Implement DirichletLM
class DirichletLM(RelevanceScorer):
    def __init__(self, index: InvertedIndex, parameters={'mu': 2000}) -> None:
        self.index = index
        self.parameters = parameters
        self.mu = parameters['mu']

    def score(self, docid: int, doc_word_counts: dict[str, int], query_word_counts: dict[str, int]) -> float:
        # 1. Get necessary information from index

        # 2. Compute additional terms to use in algorithm

        # 3. For all query_parts, compute score

        # 4. Return the score
        cwq = query_word_counts
        q_len = sum(cwq.values())
        flag = 0
        score = 0
        
        for term in cwq:
            if term in doc_word_counts and docid in self.index.document_metadata:
                flag = 1
                pwc = self.index.get_term_metadata(term)['count']/self.index.statistics['total_token_count']
                first_part = cwq[term]*math.log(1+doc_word_counts[term]/(self.mu*pwc))
                score+=first_part
        if docid in self.index.document_metadata:
            second_part = q_len*math.log(self.mu/(self.mu+self.index.document_metadata[docid]['length']))
            score+=second_part
        if not flag:
            return None
        return score


# TODO (HW1): Implement BM25
class BM25(RelevanceScorer):
    def __init__(self, index: InvertedIndex, parameters={'b': 0.75, 'k1': 1.2, 'k3': 8}) -> None:
        self.index = index
        self.b = parameters['b']
        self.k1 = parameters['k1']
        self.k3 = parameters['k3']

    def score(self, docid: int, doc_word_counts: dict[str, int], query_word_counts: dict[str, int])-> float:
        # 1. Get necessary information from index

        # 2. Find the dot product of the word count vector of the document and the word count vector of the query

        # 3. For all query parts, compute the TF and IDF to get a score

        # 4. Return score
        cwq = query_word_counts
        info = self.index.statistics # statistics
        avg_dl = info['mean_document_length']
        N = info['number_of_documents']
        score = 0
        flag = 0
        for term in cwq:
            if term in doc_word_counts and docid in self.index.document_metadata:
                flag = 1
                third_part = cwq[term]*(self.k3+1)/(self.k3+cwq[term])
                first_part = math.log((N+0.5-self.index.get_term_metadata(term)['document_count'])\
                                      /(self.index.get_term_metadata(term)['document_count']+0.5))
                ctd = doc_word_counts[term]
                second_part = ((self.k1+1)*ctd)\
                    /(self.k1*(1-self.b+self.b*self.index.document_metadata[docid]['length']/avg_dl)+ctd)
                score+=first_part*second_part*third_part
        if not flag:
            return None
        return score


# TODO (HW1): Implement Pivoted Normalization
class PivotedNormalization(RelevanceScorer):
    def __init__(self, index: InvertedIndex, parameters={'b': 0.2}) -> None:
        self.index = index
        self.b = parameters['b']

    def score(self, docid: int, doc_word_counts: dict[str, int], query_word_counts: dict[str, int]) -> float:
        # 1. Get necessary information from index

        # 2. Compute additional terms to use in algorithm

        # 3. For all query parts, compute the TF, IDF, and QTF values to get a score

        # 4. Return the score
        cwq = query_word_counts
        info = self.index.statistics # statistics
        avg_dl = info['mean_document_length']
        N = info['number_of_documents']
        score = 0
        flag = 0
        for term in cwq:
            if term in doc_word_counts and docid in self.index.document_metadata:
                flag = 1
                first_part = cwq[term]
                third_part = math.log((N+1)/self.index.get_term_metadata(term)['document_count'])
                second_part = (1+math.log(1+math.log(doc_word_counts[term])))\
                    /(1-self.b+self.b*self.index.document_metadata[docid]['length']/avg_dl)
                # print(first_part, second_part, third_part)
                score+=first_part*second_part*third_part
        if not flag:
            return None
        return score


# TODO (HW1): Implement TF-IDF
class TF_IDF(RelevanceScorer):
    def __init__(self, index: InvertedIndex, parameters={}) -> None:
        self.index = index
        self.parameters = parameters

    def score(self, docid: int, doc_word_counts: dict[str, int], query_word_counts: dict[str, int]) -> float:
        # 1. Get necessary information from index

        # 2. Compute additional terms to use in algorithm

        # 3. For all query parts, compute the TF, IDF, and QTF values to get a score

        # 4. Return the score
        cwq = query_word_counts
        doc_total = self.index.statistics['number_of_documents']  # statistics
        score = 0
        flag = 0
        for term in cwq:
            if term in doc_word_counts:
                flag = 1
                score += math.log(doc_word_counts[term]+1)*\
                    (1+math.log(doc_total/(self.index.get_term_metadata(term)['document_count'])))*cwq[term]
        if not flag:
            return None
        return score