Update README.md
Browse files
README.md
CHANGED
@@ -27,7 +27,7 @@ from modeling_bertchunker import BertChunker
|
|
27 |
|
28 |
# load bert tokenizer
|
29 |
tokenizer = AutoTokenizer.from_pretrained(
|
30 |
-
"
|
31 |
padding_side="right",
|
32 |
model_max_length=255,
|
33 |
trust_remote_code=True,
|
@@ -35,17 +35,17 @@ tokenizer = AutoTokenizer.from_pretrained(
|
|
35 |
|
36 |
# load MiniLM-L6-H384-uncased bert config
|
37 |
config = AutoConfig.from_pretrained(
|
38 |
-
"
|
39 |
trust_remote_code=True,
|
40 |
)
|
41 |
|
42 |
# initialize model
|
43 |
model = BertChunker(config)
|
44 |
-
device='
|
45 |
model.to(device)
|
46 |
|
47 |
# load parameters
|
48 |
-
state_dict = safetensors.torch.load_file("
|
49 |
model.load_state_dict(state_dict)
|
50 |
|
51 |
# text to be chunked
|
@@ -57,12 +57,20 @@ text="In the heart of the bustling city, where towering skyscrapers touch the cl
|
|
57 |
With each passing light year, the anticipation of unraveling secrets that could alter humanity's\
|
58 |
understanding of life in the universe grew ever stronger."
|
59 |
|
60 |
-
# chunk the text. The lower threshold is, the more chunks will be generated.
|
61 |
chunks=model.chunk_text(text, tokenizer, threshold=0)
|
62 |
|
63 |
# print chunks
|
64 |
for i, c in enumerate(chunks):
|
65 |
-
print(f'
|
66 |
print(c)
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
```
|
|
|
27 |
|
28 |
# load bert tokenizer
|
29 |
tokenizer = AutoTokenizer.from_pretrained(
|
30 |
+
"sentence-transformers/all-MiniLM-L6-v2",
|
31 |
padding_side="right",
|
32 |
model_max_length=255,
|
33 |
trust_remote_code=True,
|
|
|
35 |
|
36 |
# load MiniLM-L6-H384-uncased bert config
|
37 |
config = AutoConfig.from_pretrained(
|
38 |
+
"sentence-transformers/all-MiniLM-L6-v2",
|
39 |
trust_remote_code=True,
|
40 |
)
|
41 |
|
42 |
# initialize model
|
43 |
model = BertChunker(config)
|
44 |
+
device='cuda'
|
45 |
model.to(device)
|
46 |
|
47 |
# load parameters
|
48 |
+
state_dict = safetensors.torch.load_file("outputModels/checkpoint-3750/model.safetensors")
|
49 |
model.load_state_dict(state_dict)
|
50 |
|
51 |
# text to be chunked
|
|
|
57 |
With each passing light year, the anticipation of unraveling secrets that could alter humanity's\
|
58 |
understanding of life in the universe grew ever stronger."
|
59 |
|
60 |
+
# chunk the text. The lower the threshold is, the more chunks will be generated. It can be negative or positive.
|
61 |
chunks=model.chunk_text(text, tokenizer, threshold=0)
|
62 |
|
63 |
# print chunks
|
64 |
for i, c in enumerate(chunks):
|
65 |
+
print(f'-----chunk: {i}------------')
|
66 |
print(c)
|
67 |
|
68 |
+
print('----->Here is the result of fast chunk method<------:')
|
69 |
+
# chunk the text faster with a fixed context window; batchsize is the number of windows run per batch.
|
70 |
+
chunks=model.chunk_text_fast(text, tokenizer, batchsize=20, threshold=0)
|
71 |
+
|
72 |
+
# print chunks
|
73 |
+
for i, c in enumerate(chunks):
|
74 |
+
print(f'-----chunk: {i}------------')
|
75 |
+
print(c)
|
76 |
```
|