tim1900 committed on
Commit 2753894
1 Parent(s): 33292f3

Update modeling_bertchunker.py

Files changed (1): modeling_bertchunker.py (+19, -10)
modeling_bertchunker.py CHANGED
@@ -3,6 +3,7 @@ from torch import nn
 from transformers.models.bert.configuration_bert import BertConfig
 from transformers.models.bert.modeling_bert import BertModel
 import torch
+import torch.nn.functional as F
 class BertChunker(PreTrainedModel):
 
     config_class = BertConfig
@@ -36,7 +37,7 @@ class BertChunker(PreTrainedModel):
 
         return model_output
 
-    def chunk_text(self, text:str, tokenizer,threshold=0)->list[str]:
+    def chunk_text(self, text:str, tokenizer, prob_threshold=0.5)->list[str]:
         # slide context window
         MAX_TOKENS=255
         tokens=tokenizer(text, return_tensors="pt",truncation=False)
@@ -61,8 +62,10 @@ class BertChunker(PreTrainedModel):
 
         output=self(input_ids=ids,attention_mask=torch.ones(1, ids.shape[1]))
         logits = output['logits'][:, 1:-1,:]
-        is_left_greater = ((logits[:,:, 0] + threshold) < logits[:,:, 1])
-        greater_rows_indices = torch.where(is_left_greater)[1].tolist()
+
+        chunk_probabilities = F.softmax(logits, dim=-1)[:,:,1]
+        chunk_decision = (chunk_probabilities>prob_threshold)
+        greater_rows_indices = torch.where(chunk_decision)[1].tolist()
 
         # null or not
         if len(greater_rows_indices)>0 and (not (greater_rows_indices[0] == 0 and len(greater_rows_indices)==1)):
@@ -81,7 +84,7 @@ class BertChunker(PreTrainedModel):
         return substrings
 
     def chunk_text_fast(
-        self, text: str, tokenizer, batchsize=20, threshold=0
+        self, text: str, tokenizer, batchsize=20, prob_threshold=0.5
     ) -> list[str]:
         # chunk the text faster with a fixed context window, batchsize is the number of windows run per batch.
         self.eval()
@@ -129,8 +132,12 @@ class BertChunker(PreTrainedModel):
             attention_mask = torch.ones(batch_input.shape[0], batch_input.shape[1]).to(self.device)
             output = self(input_ids=batch_input, attention_mask=attention_mask)
             logits = output['logits'][:, 1:-1,:]#delete cls and sep
-            is_left_greater = ((logits[:,:, 0] + threshold) < logits[:,:, 1])
-            pos = is_left_greater * position_id[i : i + batchsize, :]
+            # is_left_greater = ((logits[:,:, 0] + 0) < logits[:,:, 1])
+
+            chunk_probabilities = F.softmax(logits, dim=-1)[:,:,1]
+            chunk_decision = (chunk_probabilities>prob_threshold)
+
+            pos = chunk_decision * position_id[i : i + batchsize, :]
             pos = pos[pos>0].tolist()
             split_str_poses += [tokens.token_to_chars(p).start for p in pos]
         if left_seq_num > 0:
@@ -138,8 +145,9 @@ class BertChunker(PreTrainedModel):
             attention_mask = torch.ones(batch_input.shape[0], batch_input.shape[1]).to(self.device)
             output = self(input_ids=batch_input, attention_mask=attention_mask)
             logits = output['logits'][:, 1:-1,:]#delete cls and sep
-            is_left_greater = ((logits[:,:, 0] + threshold) < logits[:,:, 1])
-            pos = is_left_greater * position_id[-left_seq_num:, :]
+            chunk_probabilities = F.softmax(logits, dim=-1)[:,:,1]
+            chunk_decision = (chunk_probabilities>prob_threshold)
+            pos = chunk_decision * position_id[-left_seq_num:, :]
             pos = pos[pos>0].tolist()
             split_str_poses += [tokens.token_to_chars(p).start for p in pos]
 
@@ -149,9 +157,10 @@
         attention_mask = torch.ones(left_input_ids.shape[0], left_input_ids.shape[1]).to(self.device)
         output = self(input_ids=left_input_ids, attention_mask=attention_mask)
        logits = output['logits'][:, 1:-1,:]#delete cls and sep
-        is_left_greater = ((logits[:,:, 0] + threshold) < logits[:,:, 1])
+        chunk_probabilities = F.softmax(logits, dim=-1)[:,:,1]
+        chunk_decision = (chunk_probabilities>prob_threshold)
         bias = token_num - (left_input_ids.shape[1] - 2) + 1
-        pos = (torch.where(is_left_greater)[1] + bias).tolist()
+        pos = (torch.where(chunk_decision)[1] + bias).tolist()
         split_str_poses += [tokens.token_to_chars(p).start for p in pos]
 
         substrings = [text[i:j] for i, j in zip([0] + split_str_poses, split_str_poses+[len(text)])]
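Note on the change: with a two-logit head, softmax over the last dimension gives p(split) = sigmoid(logit_1 - logit_0), so thresholding the probability is a monotone reparameterization of the old logit-margin test. prob_threshold=0.5 reproduces the old default threshold=0, and a general prob_threshold t corresponds to a margin of log(t / (1 - t)). A minimal sketch checking this equivalence on dummy logits (the tensor values and the t=0.8 setting below are illustrative assumptions, not part of the commit):

import torch
import torch.nn.functional as F

# Dummy per-token logits, shape (batch, seq, 2): column 0 = "no split", column 1 = "split".
logits = torch.tensor([[[0.2, 1.3], [2.0, -1.0]]])

# Old rule (threshold=0) and new rule (prob_threshold=0.5) make identical decisions.
old_rule = (logits[:, :, 0] + 0) < logits[:, :, 1]
new_rule = F.softmax(logits, dim=-1)[:, :, 1] > 0.5
assert torch.equal(old_rule, new_rule)

# More generally, prob_threshold=t is the same test as a logit margin of log(t / (1 - t)).
t = 0.8
margin = torch.log(torch.tensor(t / (1.0 - t)))
assert torch.equal(
    F.softmax(logits, dim=-1)[:, :, 1] > t,
    (logits[:, :, 1] - logits[:, :, 0]) > margin,
)

# A downstream call against the updated signature then reads, e.g.:
#   chunks = model.chunk_text(long_text, tokenizer, prob_threshold=0.5)

The practical upside is that prob_threshold is bounded in (0, 1) and reads as a confidence level, which is easier to tune than an unbounded logit offset.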