Spaces:
Build error
Build error
File size: 6,516 Bytes
a8d4e3d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 |
"""
regex_arxiv.py
author: Matt Bierbaum
date: 2019-03-14
RegEx patterns for finding arXiv id citations in fulltext articles.
"""
import re
# Every primary category (archive) that appears in the OAI arXiv metadata.
# Order is kept as originally listed.
CATEGORIES = [
    'acc-phys', 'adap-org', 'alg-geom', 'ao-sci',
    'astro-ph', 'atom-ph', 'bayes-an', 'chao-dyn',
    'chem-ph', 'cmp-lg', 'comp-gas', 'cond-mat',
    'cs', 'dg-ga', 'funct-an', 'gr-qc',
    'hep-ex', 'hep-lat', 'hep-ph', 'hep-th',
    'math', 'math-ph', 'mtrl-th', 'nlin',
    'nucl-ex', 'nucl-th', 'patt-sol', 'physics',
    'plasm-ph', 'q-alg', 'q-bio', 'quant-ph',
    'solv-int', 'supr-con', 'eess', 'econ',
    'q-fin', 'stat',
]
# Subcategory names that are NOT two-capital-letter codes (i.e. everything
# other than SG, SI, SP, ...). Order is kept as originally listed.
SUB_CATEGORIES = [
    "acc-ph", "ao-ph", "app-ph", "atm-clus",
    "atom-ph", "bio-ph", "chem-ph", "class-ph",
    "comp-ph", "data-an", "dis-nn", "ed-ph",
    "flu-dyn", "gen-ph", "geo-ph", "hist-ph",
    "ins-det", "med-ph", "mes-hall", "mtrl-sci",
    "optics", "other", "plasm-ph", "pop-ph",
    "quant-gas", "soc-ph", "soft", "space-ph",
    "stat-mech", "str-el", "supr-con",
]
# Names exported by a star-import of this module: only the three pattern
# strings. The helper functions defined below (strip_version, format_cat,
# zeropad_1501, clean) are not listed here and must be imported by name.
__all__ = (
'REGEX_ARXIV_SIMPLE',
'REGEX_ARXIV_STRICT',
'REGEX_ARXIV_FLEXIBLE'
)
# Maps the hyphen-less spelling of every hyphenated category or subcategory
# back to its canonical hyphenated form, e.g. 'hepth' -> 'hep-th'.
dashdict = {
    hyphenated.replace('-', ''): hyphenated
    for hyphenated in CATEGORIES + SUB_CATEGORIES
    if '-' in hyphenated
}
# Finds a version marker: "v" or "V" followed by a positive integer that has
# no leading zero (v1, V12, ...).
REGEX_VERSION_SPLITTER = re.compile(r'([vV][1-9]\d*)')


def strip_version(name):
    """Return `name` cut off at its first version marker.

    Example: ``1501.21981v1`` -> ``1501.21981``. A name that contains no
    version marker is returned unchanged.
    """
    marker = REGEX_VERSION_SPLITTER.search(name)
    if marker is None:
        return name
    # Keep only the text that precedes the marker.
    return name[:marker.start()]
def format_cat(name):
    """Normalise the category part of an old-style ID.

    For a name of the form ``category[.subcategory]/number`` the subcategory
    is dropped and a hyphen-less category spelling is replaced by its
    hyphenated form (looked up in ``dashdict``). Names without a slash are
    returned unchanged.
    """
    if '/' not in name:
        # No category part (new-style ID): nothing to normalise.
        return name
    # Exactly one slash is expected; more than one raises ValueError here.
    prefix, number = name.split('/')
    category = prefix.split('.')[0]
    canonical = dashdict.get(category, category)
    return canonical + "/" + number
def zeropad_1501(name):
    """Zero-pad the number of a new-style ID dated after 1500 to 5 digits.

    New-style IDs look like ``YYMM.number``. When the YYMM part is greater
    than 1500 and the number has fewer than five characters, leading zeros
    are restored, e.g. ``1501.1234`` -> ``1501.01234``.

    Old-style IDs (containing ``/``) and names that do not have the
    ``YYMM.number`` shape are returned unchanged instead of raising.
    """
    if '/' in name:  # old-style ID (category/number): nothing to pad
        return name
    parts = name.split('.')
    if len(parts) != 2:
        return name  # not exactly one dot: not a YYMM.number identifier
    yymm, num = parts
    if not num:
        return name  # nothing after the dot: do not invent a number
    try:
        dated_after_1500 = int(yymm) > 1500
    except ValueError:
        return name  # non-numeric date part: leave the name untouched
    if dated_after_1500 and len(num) < 5:
        # Pad to the full width rather than prepending a single "0", so an ID
        # that lost more than one leading zero is repaired as well.
        return yymm + '.' + num.rjust(5, '0')
    return name
def clean(name):
    """Fix common mistakes in an arXiv ID so that it matches more reliably.

    The corrections are applied in this order: strip_version, format_cat,
    zeropad_1501.
    """
    unversioned = strip_version(name)
    categorised = format_cat(unversioned)
    return zeropad_1501(categorised)
# A common typo is to drop the hyphen from a category name ("hepth" for
# "hep-th"), so both spellings are accepted.
# The de-duplicated names are sorted (longest first, then alphabetically):
# iterating a bare set of strings follows hash order, which can differ between
# interpreter runs, and that made the text of every pattern built from these
# lists non-reproducible.
categories = sorted(
    set(CATEGORIES + [cat.replace('-', '') for cat in CATEGORIES]),
    key=lambda cat: (-len(cat), cat),
)
subcategories = sorted(
    set(SUB_CATEGORIES + [cat.replace('-', '') for cat in SUB_CATEGORIES]),
    key=lambda cat: (-len(cat), cat),
)
# A category name, optionally followed by a subcategory: either a dot and two
# capital letters ("cs.AI", "math.RT") or one of the longer subcategory names
# ("cond-mat.stat-mech", "physics.optics").
# The dot before a long subcategory is optional. The previous pattern had no
# dot there at all, so "physics.optics/0101001" could not match as a whole and
# a search picked up the bogus "cs/0101001" from its tail instead. Keeping the
# dot optional preserves every string the previous pattern accepted.
RE_CATEGORIES = r'(?:{})(?:(?:[.][A-Z]{{2}})|(?:[.]?(?:{})))?'.format(
    r'|'.join(categories), r'|'.join(subcategories)
)
# YYMM date stamp of an identifier: a two-digit year followed by a month
# 01-12. Accepted years are 91-99 and 00-29.
# NOTE: the pattern used to stop at YY=19 because it was written in 2019,
# which rejected every identifier issued from January 2020 onward. The upper
# bound is kept finite on purpose (a wider range admits more arbitrary
# four-digit numbers) and has to be raised again before 2030.
# NOTE: nothing in this pattern prevents the date from being preceded by
# other digits; it contains no look-behind.
RE_DATE = r'(?:(?:[0-2][0-9])|(?:9[1-9]))(?:0[1-9]|1[0-2])'
# Optional version suffix: "v" or "V" plus a positive integer without a
# leading zero.
RE_VERSION = r'(?:[vV][1-9]\d*)?'
# =============================================================================
# New-style number: YYMM, a dot, four or five digits, optional version.
RE_NUM_NEW = r'{}(?:[.]\d{{4,5}}){}'.format(RE_DATE, RE_VERSION)
# Old-style number: YYMM directly followed by three digits, optional version.
RE_NUM_OLD = r'{}(?:\d{{3}}){}'.format(RE_DATE, RE_VERSION)
# New-style identifier, e.g. 1612.00001 or 1203.0023v2
RE_ID_NEW = r'(?:' + RE_NUM_NEW + r')'
# Old-style identifier, e.g. hep-th/0306165, cs/0112345v2 or cs.AI/0112345v2
RE_ID_OLD = r'(?:' + RE_CATEGORIES + r'/' + RE_NUM_OLD + r')'
# =============================================================================
# URL-like prefix, matched case-insensitively: an optional scheme, an optional
# "arxiv.org/" host part and a mandatory "abs/" or "pdf/" path segment.
# e.g. https://arxiv.org/abs/   arxiv.org/abs/   abs/
RE_PREFIX_URL = ''.join([
    r'(?:',
    r'(?i:http[s]?\://)?',   # scheme, if present
    r'(?i:arxiv\.org/)?',    # host, if present
    r'(?i:abs/|pdf/)',       # the path segment is required
    r')',
])
# Case-insensitive "arxiv" followed by any run of whitespace and the separator
# characters : / , .
# matches: arXiv: arxiv/ arxiv
RE_PREFIX_ARXIV = r'(?i:arxiv\s*[:/\s,.]*\s*)'
# Case-insensitive version of RE_CATEGORIES: a category name with an optional
# subcategory. The slash that follows the category in an old-style ID is not
# part of this pattern.
# matches: cs.AI nucl-th
RE_PREFIX_CATEGORIES = r'(?i:{})'.format(RE_CATEGORIES)
# Case-insensitive "eprint" / "e-print" with an optional plural "s", followed
# by one to three arbitrary characters (separator and/or spacing).
# matches: e-prints: e-print eprints:
RE_PREFIX_EPRINT = r'(?i:e[-]?print[s]?.{1,3})'
# =============================================================================
# A bare old- or new-style identifier with no prefix; contains no capture
# group of its own.
REGEX_ARXIV_SIMPLE = r'(?:' + RE_ID_OLD + r'|' + RE_ID_NEW + r')'

# Follows the guide at https://arxiv.org/help/arxiv_identifier: an "arXiv"
# prefix followed by the identifier. Group 1 captures an old-style id,
# group 2 a new-style id.
REGEX_ARXIV_STRICT = r'(?:{prefix})(?:({old})|({new}))'.format(
    prefix=RE_PREFIX_ARXIV, old=RE_ID_OLD, new=RE_ID_NEW
)
# This regex essentially accepts anything that looks like an arXiv id and has
# the slightest smell of being one as well. That is, if it is an id and
# mentions anything about the arXiv beforehand, then it is an id.
#
# Structure -- three top-level alternatives:
#   1. a bare identifier                                      -> group 1
#   2. an identifier introduced by a prefix, either
#        [url] [eprint] [arxiv] old-style id                  -> group 2
#        [url] [eprint] arxiv [category/] new-style id        -> group 3
#      or
#        url | eprint | category | arxiv, then any text
#        (lazy `.*?`), then an identifier                     -> group 4
#   3. an identifier wrapped in round or square brackets      -> group 5
# None of the component patterns contains a capturing group, so exactly one of
# the five groups participates in any given match.
#
# Adjacent string literals are joined before `.format` is applied to them, so
# each `.format` call fills the single `{}` of the literal it is attached to
# plus the brace-free literals immediately preceding it.
REGEX_ARXIV_FLEXIBLE = (
# --- alternative 1: bare identifier
r'(?:'
r'({})'.format(REGEX_ARXIV_SIMPLE) + # capture group 1
# --- alternative 2: identifier introduced by a prefix
r')|(?:'
r'(?:'
r'(?:{})?'.format(RE_PREFIX_URL) +
r'(?:{})?'.format(RE_PREFIX_EPRINT) +
r'(?:'
# old-style id: the "arxiv" prefix is optional
r'(?:{})?'.format(RE_PREFIX_ARXIV) +
r'({})'.format(RE_ID_OLD) + # capture group 2
r'|'
# new-style id: the "arxiv" prefix is required, a category may precede the id
r'(?:{})'.format(RE_PREFIX_ARXIV) +
r'(?:{}/)?'.format(RE_CATEGORIES) +
r'({})'.format(RE_ID_NEW) + # capture group 3
r')'
r')'
r'|'
# any single prefix, then arbitrary text, then the first identifier found
r'(?:'
r'(?:{})|'.format(RE_PREFIX_URL) +
r'(?:{})|'.format(RE_PREFIX_EPRINT) +
r'(?:{})|'.format(RE_PREFIX_CATEGORIES) +
r'(?:{})'.format(RE_PREFIX_ARXIV) +
r')'
r'.*?'
r'({})'.format(REGEX_ARXIV_SIMPLE) + # capture group 4
# --- alternative 3: identifier inside ( ) or [ ], inner whitespace allowed
r')|(?:'
r'(?:[\[\(]\s*)'
r'({})'.format(REGEX_ARXIV_SIMPLE) + # capture group 5
r'(?:\s*[\]\)])'
r')'
)
# Citation snippets that each contain an arXiv identifier.
# NOTE: two entries used to lack a trailing comma, so Python's implicit
# string concatenation silently merged them with the following line; every
# snippet is now its own list element.
TEST_POSITIVE = [
    'arXiv:quant-ph 1503.01017v3',
    'math. RT/0903.2992',
    'arXiv, 1511.03262',
    'tions. arXiv preprint arXiv:1607.00021, 2016',
    'Math. Phys. 255, 577 (2005), hep-th/0306165',
    'Kuzovlev, arXiv:cond-mat/9903350 ',
    'arXiv:math.RT/1206.5933,',
    'arXiv e-prints 1306.1595',
    'ays, JHEP 07 (2009) 055, [ 0903.0883]',
    ' Rev. D71 (2005) 063534, [ astro-ph/0501562]',
    'e-print arXiv:1506.02215v1',
    'available at: http://arxiv.org/abs/1511.08977',
    'arXiv e-print: 1306.2144',
    'Preprint arXiv:math/0612139',
    'Vertices in a Digraph. arXiv preprint 1602.02129 ',
    'cond-mat/0309488.',
    'decays, 1701.01871 LHCB-PAPE',
    'Distribution. In: 1404.2485v3 (2015)',
    '113005 (2013), 1307.4331,',
    'scalar quantum 1610.07877v1',
    'cond-mat/0309488.',
    'cond-mat/0309488.8383',
]
# DOI snippets whose digit runs resemble new-style identifiers; presumably
# none of them should be reported as an arXiv id.
TEST_NEGATIVE = [
    "doi: 10.1145/ 321105.321114 ",
    "doi: 10.1145/ 1105.321114 ",
]
|