tim1900 committed
Commit 7964b25
1 Parent(s): 9657071

Update README.md

Files changed (1): README.md (+14 -6)
README.md CHANGED
@@ -27,7 +27,7 @@ from modeling_bertchunker import BertChunker
 
 # load bert tokenizer
 tokenizer = AutoTokenizer.from_pretrained(
-    "./",
+    "sentence-transformers/all-MiniLM-L6-v2",
     padding_side="right",
     model_max_length=255,
     trust_remote_code=True,
@@ -35,17 +35,17 @@ tokenizer = AutoTokenizer.from_pretrained(
 
 # load MiniLM-L6-H384-uncased bert config
 config = AutoConfig.from_pretrained(
-    "./",
+    "sentence-transformers/all-MiniLM-L6-v2",
     trust_remote_code=True,
 )
 
 # initialize model
 model = BertChunker(config)
-device='cpu'
+device='cuda'
 model.to(device)
 
 # load parameters
-state_dict = safetensors.torch.load_file("./model.safetensors")
+state_dict = safetensors.torch.load_file("outputModels/checkpoint-3750/model.safetensors")
 model.load_state_dict(state_dict)
 
 # text to be chunked
@@ -57,12 +57,20 @@ text="In the heart of the bustling city, where towering skyscrapers touch the cl
 With each passing light year, the anticipation of unraveling secrets that could alter humanity's\
 understanding of life in the universe grew ever stronger."
 
-# chunk the text. The lower threshold is, the more chunks will be generated.
+# chunk the text. The lower the threshold is, the more chunks will be generated. It can be negative or positive.
 chunks=model.chunk_text(text, tokenizer, threshold=0)
 
 # print chunks
 for i, c in enumerate(chunks):
-    print(f'------------------')
+    print(f'-----chunk: {i}------------')
     print(c)
 
+print('----->Here is the result of the fast chunk method<------:')
+# chunk the text faster with a fixed context window; batchsize is the number of windows run per batch.
+chunks=model.chunk_text_fast(text, tokenizer, batchsize=20, threshold=0)
+
+# print chunks
+for i, c in enumerate(chunks):
+    print(f'-----chunk: {i}------------')
+    print(c)
 ```
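For context on the `chunk_text_fast` call this commit adds: its comment describes scoring fixed-size token windows in batches instead of running one long sequence through the model. Below is a minimal sketch of that windowing pattern, not BertChunker's actual implementation: it stands in a generic two-label `AutoModelForTokenClassification` head for BertChunker (assuming label 1 means "token starts a new chunk"), and the helper name `chunk_text_fast_sketch` is hypothetical.

```python
# Minimal sketch of fixed-window batched chunking -- NOT BertChunker's real code.
# Assumptions: a two-label token-classification model stands in for BertChunker
# (label 1 = "token starts a new chunk"); the helper name is hypothetical.
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

def chunk_text_fast_sketch(text, tokenizer, model, window=255, batchsize=20, threshold=0.0):
    enc = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
    ids, offsets = enc["input_ids"], enc["offset_mapping"]

    # slice the token stream into non-overlapping fixed-size windows
    windows = [ids[i:i + window] for i in range(0, len(ids), window)]

    flags = []  # one boolean per token: does a new chunk start here?
    for b in range(0, len(windows), batchsize):
        batch = windows[b:b + batchsize]
        maxlen = max(len(w) for w in batch)
        input_ids = torch.tensor(
            [w + [tokenizer.pad_token_id] * (maxlen - len(w)) for w in batch]
        )
        mask = torch.tensor([[1] * len(w) + [0] * (maxlen - len(w)) for w in batch])
        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=mask).logits
        scores = logits[..., 1] - logits[..., 0]  # boundary score per token
        for w, s in zip(batch, scores):
            flags.extend((s[: len(w)] > threshold).tolist())

    # map token-level boundaries back to character positions and cut there
    cuts = [0] + [offsets[t][0] for t in range(1, len(flags)) if flags[t]] + [len(text)]
    return [text[a:b] for a, b in zip(cuts, cuts[1:]) if text[a:b].strip()]

# usage with a randomly initialized head, just to exercise the plumbing:
tok = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
mdl = AutoModelForTokenClassification.from_pretrained(
    "sentence-transformers/all-MiniLM-L6-v2", num_labels=2
)
print(chunk_text_fast_sketch("Some text. More text.", tok, mdl))
```

The batching explains the speedup: every window has the same length, so `batchsize` windows can be scored in a single forward pass, at the cost of the model never seeing tokens across a window boundary.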