|
from langchain.text_splitter import CharacterTextSplitter |
|
import re |
|
from typing import List |
|
|
|
|
|
class AliTextSplitter(CharacterTextSplitter):
    """Text splitter that delegates segmentation to a ModelScope pipeline.

    The text is passed to the ``document-segmentation`` pipeline backed by the
    ``damo/nlp_bert_document-segmentation_chinese-base`` model, running on CPU.
    The pipeline is created lazily on the first ``split_text`` call and reused
    for later calls on the same instance.
    """

    def __init__(self, pdf: bool = False, **kwargs):
        """Create the splitter.

        Args:
            pdf: When true, whitespace in the input is normalised before
                segmentation (see ``split_text``).
            **kwargs: Forwarded unchanged to ``CharacterTextSplitter``.
        """
        super().__init__(**kwargs)
        self.pdf = pdf
        # Built on first use; constructing the pipeline loads the model.
        self._pipeline = None

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into the segments produced by the pipeline.

        Args:
            text: The document text to segment.

        Returns:
            The non-empty pieces obtained by splitting the pipeline's
            ``"text"`` output on the ``"\\n\\t"`` separator. An empty input
            yields an empty list without loading the model.

        Raises:
            ImportError: If the ``modelscope`` package is not installed.
        """
        if not text:
            return []

        if self.pdf:
            # Collapse runs of three or more newlines into a single one, then
            # turn every whitespace character (newlines included) into a
            # space. After this step no newline remains, which is why the
            # former "\n\n" removal was a no-op and has been dropped.
            text = re.sub(r"\n{3,}", "\n", text)
            text = re.sub(r"\s", " ", text)

        segmenter = self._get_pipeline()
        result = segmenter(documents=text)
        return [segment for segment in result["text"].split("\n\t") if segment]

    def _get_pipeline(self):
        """Return the segmentation pipeline, creating it on first use."""
        # getattr keeps this working for instances built without __init__.
        cached = getattr(self, "_pipeline", None)
        if cached is not None:
            return cached

        try:
            # Imported here so the dependency is only needed when splitting.
            from modelscope.pipelines import pipeline
        except ImportError as err:
            raise ImportError(
                "AliTextSplitter requires the `modelscope` package, "
                "which could not be imported."
            ) from err

        self._pipeline = pipeline(
            task="document-segmentation",
            model='damo/nlp_bert_document-segmentation_chinese-base',
            device="cpu",
        )
        return self._pipeline
|
|