first commit
- corpus.txt +5 -0
- main.py +105 -0
corpus.txt
ADDED
@@ -0,0 +1,5 @@
+我爱自然语言处理
+雷老师永远的神
+我爱雷老师
+自然语言不爱我啊
+你是我的NLP

main.py
ADDED
@@ -0,0 +1,105 @@
+# Import the required libraries
+import random
+import jieba
+import collections
+
+# Define the n-gram language model class
+class NGramModel:
+
+    # Initialize the model with the order n and the smoothing parameter alpha
+    def __init__(self, n, alpha):
+        self.n = n
+        self.alpha = alpha
+        self.ngrams = collections.defaultdict(int)    # counts of n-grams
+        self.contexts = collections.defaultdict(int)  # counts of (n-1)-grams
+        self.vocabulary = set()                       # the vocabulary
+
+    # Train the model on a corpus, updating the n-gram and (n-1)-gram counts and the vocabulary
+    def train(self, corpus):
+        for sentence in corpus:
+            # Add start and end markers around the sentence
+            sentence = ["<s>"] * (self.n - 1) + sentence + ["</s>"]
+            # Count every n-gram
+            for i in range(len(sentence) - self.n + 1):
+                ngram = tuple(sentence[i:i+self.n])
+                self.ngrams[ngram] += 1
+                # Count the corresponding (n-1)-gram context
+                context = tuple(sentence[i:i+self.n-1])
+                self.contexts[context] += 1
+                # Update the vocabulary
+                self.vocabulary.update(ngram)
+
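
To make train()'s bookkeeping concrete, here is what it produces for a one-sentence toy corpus (an illustrative sketch, not part of the commit, assuming the NGramModel class defined above):

    m = NGramModel(2, 0.01)
    m.train([["我", "爱", "你"]])
    # ngrams:     {('<s>', '我'): 1, ('我', '爱'): 1, ('爱', '你'): 1, ('你', '</s>'): 1}
    # contexts:   {('<s>',): 1, ('我',): 1, ('爱',): 1, ('你',): 1}
    # vocabulary: {'<s>', '我', '爱', '你', '</s>'}
    print(dict(m.ngrams), dict(m.contexts), m.vocabulary)
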
+    # Predict the probability distribution of the next word for an (n-1)-gram context,
+    # using additive smoothing
+    def predict(self, context):
+        # Initialize the probability distribution dictionary
+        probabilities = {}
+        # Iterate over every word in the vocabulary
+        for word in self.vocabulary:
+            # Build the candidate n-gram
+            ngram = tuple(context) + (word,)
+            # Smoothed probability of the n-gram
+            probability = (self.ngrams[ngram] + self.alpha) / (self.contexts[tuple(context)] + self.alpha * len(self.vocabulary))
+            # Store it in the dictionary
+            probabilities[word] = probability
+        # Return the probability distribution
+        return probabilities
+
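
predict() uses additive (add-alpha, i.e. Lidstone) smoothing: P(w | context) = (count(context + w) + alpha) / (count(context) + alpha * |V|), so every vocabulary word gets non-zero probability and the distribution still sums to 1. A quick check on a toy bigram model (an illustrative sketch, not part of the commit):

    m = NGramModel(2, 0.5)
    m.train([["a", "b"], ["a", "c"]])
    probs = m.predict(("a",))
    # |V| = 5 ('<s>', 'a', 'b', 'c', '</s>'), count(('a',)) = 2, count(('a', 'b')) = 1
    # P('b' | 'a') = (1 + 0.5) / (2 + 0.5 * 5) = 1.5 / 4.5 ≈ 0.333
    print(round(probs["b"], 3), round(sum(probs.values()), 3))  # 0.333 1.0
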
+    # Randomly pick a word from the probability distribution (roulette-wheel selection)
+    def sample(self, probabilities):
+        # Total probability mass of the distribution
+        total = sum(probabilities.values())
+        # Draw a random number between 0 and the total
+        random_number = random.uniform(0, total)
+        # Running cumulative probability
+        cumulative_probability = 0.0
+        # Walk through each word and its probability
+        for word, probability in probabilities.items():
+            # Accumulate the probability mass
+            cumulative_probability += probability
+            # Return the word once the cumulative mass reaches the draw
+            if cumulative_probability >= random_number:
+                return word
+
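
sample() is roulette-wheel (fitness-proportionate) selection over the smoothed distribution. The standard library can do the same weighted draw in one call; an equivalent alternative (a sketch, not part of the commit):

    # random.choices draws proportionally to the given weights, just like sample()
    word = random.choices(list(probabilities), weights=list(probabilities.values()), k=1)[0]
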
+    # Generate a sentence from the model, given an initial seed context
+    def generate(self, context):
+        # Normalize the seed: keep at most the last n-1 tokens and left-pad shorter
+        # seeds with start markers, so predict() always sees an (n-1)-gram
+        context = (("<s>",) * (self.n - 1) + tuple(context))[-(self.n - 1):]
+        # The generated sentence starts as the (padded) seed context
+        sentence = list(context)
+        # Keep sampling the next word until the end marker is produced
+        while True:
+            # Predict the distribution over the next word
+            probabilities = self.predict(context)
+            # Sample one word from it
+            word = self.sample(probabilities)
+            # Append it to the sentence
+            sentence.append(word)
+            # Stop at the end marker
+            if word == "</s>":
+                break
+            # Slide the context window: drop the first word, append the new one
+            context = context[1:] + (word,)
+        # Return the sentence without the start and end markers
+        return [w for w in sentence if w not in ("<s>", "</s>")]
+
+# Read the corpus file, segment each line into words, and collect the results
+corpus = []
+with open("corpus.txt", encoding="utf-8") as f:
+    for line in f:
+        line = line.strip()
+        if line:
+            words = list(jieba.cut(line))
+            corpus.append(words)
+print("Number of sentences in the corpus:", len(corpus))
+print(corpus)
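
jieba.cut returns a generator of tokens, which is why the code wraps it in list(). To get a feel for the segmentation (a sketch; the exact split depends on jieba's dictionary and version):

    print("/".join(jieba.cut("我爱自然语言处理")))
    # typically something like: 我/爱/自然语言/处理
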
+# Build a 3-gram model with smoothing parameter 0.01
+model = NGramModel(3, 0.01)
+# Train the model on the corpus
+model.train(corpus)
+print("Number of words in the vocabulary:", len(model.vocabulary))
+print("(n-1)-gram counts:", model.contexts.items())
+
+# Generate a sentence with the initial context ("我", "爱")
+sentence = model.generate(("我", "爱"))
+# Join the generated word list into a string and print it
+print("".join(sentence))
+sentence = model.generate(("我",))
+print("".join(sentence))
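
Generation is stochastic, so each run prints different sentences; seeding the random module first makes runs reproducible (a sketch, not part of the commit; the seed value 42 is arbitrary):

    random.seed(42)
    print("".join(model.generate(("我", "爱"))))  # same output on every run with this seed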