Update README.md
Browse files
README.md
CHANGED
@@ -27,7 +27,7 @@ from modeling_bertchunker import BertChunker
|
|
27 |
|
28 |
# load bert tokenizer
|
29 |
tokenizer = AutoTokenizer.from_pretrained(
|
30 |
-
"
|
31 |
padding_side="right",
|
32 |
model_max_length=255,
|
33 |
trust_remote_code=True,
|
@@ -35,17 +35,17 @@ tokenizer = AutoTokenizer.from_pretrained(
|
|
35 |
|
36 |
# load MiniLM-L6-H384-uncased bert config
|
37 |
config = AutoConfig.from_pretrained(
|
38 |
-
"
|
39 |
trust_remote_code=True,
|
40 |
)
|
41 |
|
42 |
# initialize model
|
43 |
model = BertChunker(config)
|
44 |
-
device='
|
45 |
model.to(device)
|
46 |
|
47 |
# load parameters
|
48 |
-
state_dict = safetensors.torch.load_file("
|
49 |
model.load_state_dict(state_dict)
|
50 |
|
51 |
# text to be chunked
|
@@ -57,12 +57,20 @@ text="In the heart of the bustling city, where towering skyscrapers touch the cl
|
|
57 |
With each passing light year, the anticipation of unraveling secrets that could alter humanity's\
|
58 |
understanding of life in the universe grew ever stronger."
|
59 |
|
60 |
-
# chunk the text. The lower threshold is, the more chunks will be generated.
|
61 |
chunks=model.chunk_text(text, tokenizer, threshold=0)
|
62 |
|
63 |
# print chunks
|
64 |
for i, c in enumerate(chunks):
|
65 |
-
print(f'
|
66 |
print(c)
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
```
|
|
|
27 |
|
28 |
# load bert tokenizer
|
29 |
tokenizer = AutoTokenizer.from_pretrained(
|
30 |
+
"sentence-transformers/all-MiniLM-L6-v2",
|
31 |
padding_side="right",
|
32 |
model_max_length=255,
|
33 |
trust_remote_code=True,
|
|
|
35 |
|
36 |
# load MiniLM-L6-H384-uncased bert config
|
37 |
config = AutoConfig.from_pretrained(
|
38 |
+
"sentence-transformers/all-MiniLM-L6-v2",
|
39 |
trust_remote_code=True,
|
40 |
)
|
41 |
|
42 |
# initialize model
|
43 |
model = BertChunker(config)
|
44 |
+
device='cuda'
|
45 |
model.to(device)
|
46 |
|
47 |
# load parameters
|
48 |
+
state_dict = safetensors.torch.load_file("outputModels/checkpoint-3750/model.safetensors")
|
49 |
model.load_state_dict(state_dict)
|
50 |
|
51 |
# text to be chunked
|
|
|
57 |
With each passing light year, the anticipation of unraveling secrets that could alter humanity's\
|
58 |
understanding of life in the universe grew ever stronger."
|
59 |
|
60 |
+
# chunk the text. The lower the threshold is, the more chunks will be generated. It can be negative or positive.
|
61 |
chunks=model.chunk_text(text, tokenizer, threshold=0)
|
62 |
|
63 |
# print chunks
|
64 |
for i, c in enumerate(chunks):
|
65 |
+
print(f'-----chunk: {i}------------')
|
66 |
print(c)
|
67 |
|
68 |
+
print('----->Here is the result of fast chunk method<------:')
|
69 |
+
# chunk the text faster with a fixed context window; batchsize is the number of windows run per batch.
|
70 |
+
chunks=model.chunk_text_fast(text, tokenizer, batchsize=20, threshold=0)
|
71 |
+
|
72 |
+
# print chunks
|
73 |
+
for i, c in enumerate(chunks):
|
74 |
+
print(f'-----chunk: {i}------------')
|
75 |
+
print(c)
|
76 |
```
|