File size: 2,329 Bytes
01e655b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2b7e94
01e655b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2b7e94
 
01e655b
 
 
d2b7e94
 
 
 
 
 
 
 
 
 
 
 
01e655b
 
d2b7e94
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import string

from pypinyin.constants import SUPPORT_UCS4

# Full-width <-> half-width conversion tables, keyed and valued by code point.
# Each full-width entry is derived from its ASCII counterpart by adding a
# fixed offset of 65248 (0xFEE0) to the code point.
_FULLWIDTH_SHIFT = 0xFEE0

# ASCII letters, full-width -> half-width (52 entries)
F2H_ASCII_LETTERS = {
    half + _FULLWIDTH_SHIFT: half for half in map(ord, string.ascii_letters)
}
# ASCII letters, half-width -> full-width
H2F_ASCII_LETTERS = dict(
    zip(F2H_ASCII_LETTERS.values(), F2H_ASCII_LETTERS.keys())
)

# Digits, full-width -> half-width (10 entries)
F2H_DIGITS = {
    half + _FULLWIDTH_SHIFT: half for half in map(ord, string.digits)
}
# Digits, half-width -> full-width
H2F_DIGITS = dict(zip(F2H_DIGITS.values(), F2H_DIGITS.keys()))

# Punctuation, full-width -> half-width (32 entries)
F2H_PUNCTUATIONS = {
    half + _FULLWIDTH_SHIFT: half for half in map(ord, string.punctuation)
}
# Punctuation, half-width -> full-width
H2F_PUNCTUATIONS = dict(
    zip(F2H_PUNCTUATIONS.values(), F2H_PUNCTUATIONS.keys())
)

# Ideographic space (U+3000) <-> ASCII space (U+0020).
# The single-character string entries are kept so that direct lookups such as
# F2H_SPACE["\u3000"] keep working. The code-point entries are added because
# str.translate() indexes its table by ordinal: a table keyed only by strings
# silently leaves the text unchanged. They also match the ordinal -> ordinal
# layout of the letter, digit and punctuation tables in this module.
F2H_SPACE = {"\u3000": " ", 0x3000: 0x20}
H2F_SPACE = {" ": "\u3000", 0x20: 0x3000}

# Matches a maximal run of characters that are NOT "Chinese characters with a
# pinyin reading"; usable for NSW extraction.
# The bracketed range in each trailing comment is the block's label; the
# pattern itself uses the literal bounds written in the string.

# Ranges excluded from the match regardless of SUPPORT_UCS4.
_NSW_BMP_RANGES = (
    r"\u3007"  # 〇
    r"\u3400-\u4dbf"  # CJK Extension A: [3400-4DBF]
    r"\u4e00-\u9fff"  # CJK basic: [4E00-9FFF]
    r"\uf900-\ufaff"  # CJK Compatibility: [F900-FAFF]
)
# Ranges excluded only when SUPPORT_UCS4 is true.
_NSW_UCS4_RANGES = (
    r"\U00020000-\U0002A6DF"  # CJK Extension B: [20000-2A6DF]
    r"\U0002A703-\U0002B73F"  # CJK Extension C: [2A700-2B73F]
    r"\U0002B740-\U0002B81D"  # CJK Extension D: [2B740-2B81D]
    r"\U0002F80A-\U0002FA1F"  # CJK Compatibility Supplement: [2F800-2FA1F]
)
_NSW_EXCLUDED = _NSW_BMP_RANGES + (_NSW_UCS4_RANGES if SUPPORT_UCS4 else "")

RE_NSW = re.compile(r"(?:[^" + _NSW_EXCLUDED + r"])+")