"""
oai_metadata.py
authors: Matt Bierbaum and Colin Clement
date: 2019-02-25
This module interacts with the Open Archives Initiative API, downloading
the metadata for all arXiv articles.
Usage
=====
python oai_metadata.py data/<savefile>.json.gz
Notes
=====
The save file is not a single JSON document but gzip-compressed JSON Lines:
one JSON record per line. Use the helper function load_metadata to be sure
to open it without error.
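Example
=======
A short example of opening a completed save file with the helper; the
filename shown is illustrative:

    from arxiv_public_data.oai_metadata import load_metadata
    metadata = load_metadata('data/arxiv-metadata-oai-2019-02-25.json.gz')
    print(metadata[0]['title'])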
Resources
=========
* http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm
* https://arxiv.org/help/oa/index
"""
import os
import gzip
import glob
import json
import time
import hashlib
import datetime
import requests
import xml.etree.ElementTree as ET
from arxiv_public_data.config import LOGGER, DIR_BASE
log = LOGGER.getChild('metadata')
URL_ARXIV_OAI = 'https://export.arxiv.org/oai2'
URL_CITESEER_OAI = 'http://citeseerx.ist.psu.edu/oai2'
OAI_XML_NAMESPACES = {
'OAI': 'http://www.openarchives.org/OAI/2.0/',
'arXiv': 'http://arxiv.org/OAI/arXivRaw/'
}
def get_list_record_chunk(resumptionToken=None, harvest_url=URL_ARXIV_OAI,
metadataPrefix='arXivRaw'):
"""
    Query the OAI API for the metadata of 1000 arXiv articles

    Parameters
    ----------
    resumptionToken : str
        Token for the API which triggers the next 1000 articles
    harvest_url : str
        URL of the OAI endpoint to harvest from
    metadataPrefix : str
        OAI metadata format to request (ignored when a resumptionToken
        is supplied)

    Returns
    -------
    record_chunk : str
        metadata of 1000 arXiv articles as an XML string
"""
parameters = {'verb': 'ListRecords'}
if resumptionToken:
parameters['resumptionToken'] = resumptionToken
else:
parameters['metadataPrefix'] = metadataPrefix
response = requests.get(harvest_url, params=parameters)
if response.status_code == 200:
return response.text
    if response.status_code == 503:
        secs = int(response.headers.get('Retry-After', 20)) * 1.5
        log.info('Requested to wait, waiting {} seconds until retry...'.format(secs))
        time.sleep(secs)
        # retry with the same endpoint and metadata format, not just the token
        return get_list_record_chunk(
            resumptionToken=resumptionToken,
            harvest_url=harvest_url,
            metadataPrefix=metadataPrefix,
        )
    raise Exception(
        'Unknown error in HTTP request {}, status code: {}'.format(
            response.url, response.status_code
        )
    )
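
# For reference, the request above resolves to URLs of roughly this shape
# (illustrative, assembled by requests from the `parameters` dict):
#   https://export.arxiv.org/oai2?verb=ListRecords&metadataPrefix=arXivRaw
#   https://export.arxiv.org/oai2?verb=ListRecords&resumptionToken=<token>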
def _record_element_text(elm, name):
""" XML helper function for extracting text from leaf (single-node) elements """
item = elm.find('arXiv:{}'.format(name), OAI_XML_NAMESPACES)
return item.text if item is not None else None
def _record_element_all(elm, name):
""" XML helper function for extracting text from queries with multiple nodes """
return elm.findall('arXiv:{}'.format(name), OAI_XML_NAMESPACES)
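
# For orientation, the arXivRaw payload of one record looks roughly like the
# sketch below; element names follow the arXivRaw schema registered in
# OAI_XML_NAMESPACES, and all values are illustrative:
#
#   <arXivRaw xmlns="http://arxiv.org/OAI/arXivRaw/">
#     <id>0704.0001</id>
#     <submitter>...</submitter>
#     <title>...</title>
#     <abstract>...</abstract>
#     <categories>hep-ph</categories>
#     <version version="v1">...</version>
#   </arXivRaw>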
def parse_record(elm):
"""
    Parse the XML element of a single arXiv article into a dictionary of
    attributes

    Parameters
    ----------
    elm : xml.etree.ElementTree.Element
        Element of the record of a single arXiv article
Returns
-------
output : dict
        Attributes of the arXiv article stored as a dict with the keys
        id, submitter, authors, title, comments, journal-ref, doi, abstract,
        report-no, categories, and versions
"""
text_keys = [
'id', 'submitter', 'authors', 'title', 'comments',
'journal-ref', 'doi', 'abstract', 'report-no'
]
output = {key: _record_element_text(elm, key) for key in text_keys}
output['categories'] = [
i.text for i in (_record_element_all(elm, 'categories') or [])
]
output['versions'] = [
i.attrib['version'] for i in _record_element_all(elm, 'version')
]
return output
def parse_xml_listrecords(root):
"""
    Parse XML of one chunk of the metadata of 1000 arXiv articles
    into a list of dictionaries

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Element containing the records of an entire chunk of arXiv queries

    Returns
    -------
    records, resumptionToken : list, str
        records is a list of up to 1000 dictionaries, each containing the
        attributes of a single arXiv article
        resumptionToken is a string which is fed into the subsequent query
"""
resumptionToken = root.find(
'OAI:ListRecords/OAI:resumptionToken',
OAI_XML_NAMESPACES
)
resumptionToken = resumptionToken.text if resumptionToken is not None else ''
records = root.findall(
'OAI:ListRecords/OAI:record/OAI:metadata/arXiv:arXivRaw',
OAI_XML_NAMESPACES
)
records = [parse_record(p) for p in records]
return records, resumptionToken
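
# A minimal sketch of chaining one manual query through the parsers above
# (assumes network access; check_xml_errors is defined just below):
#
#   xml_text = get_list_record_chunk()
#   root = ET.fromstring(xml_text)
#   check_xml_errors(root)
#   records, token = parse_xml_listrecords(root)
#   next_chunk = get_list_record_chunk(resumptionToken=token)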
def check_xml_errors(root):
    """ Check for, log, and raise any OAI service errors in the XML """
    error = root.find('OAI:error', OAI_XML_NAMESPACES)
    if error is not None:
        log.error('OAI service returned error: {}'.format(error.text))
        raise RuntimeError(
            'OAI service returned error: {}'.format(error.text)
        )
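
# OAI errors arrive as a top-level <error> element, e.g. (illustrative):
#   <error code="badResumptionToken">...</error>
# The OAI-PMH protocol document linked in the module docstring defines the
# standard error codes (badArgument, badResumptionToken, noRecordsMatch, ...).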
def find_default_locations():
    """ Return the most recent default save file in DIR_BASE, or None """
    outfile = os.path.join(DIR_BASE, 'arxiv-metadata-oai-*.json.gz')
    fn_outfile = sorted(glob.glob(outfile))
    if fn_outfile:
        return fn_outfile[-1]
    return None
def all_of_arxiv(outfile=None, resumptionToken=None, autoresume=True):
"""
    Download the metadata for every article in the arXiv via the OAI API

    Parameters
    ----------
    outfile : str (default DIR_BASE/arxiv-metadata-oai-<date>.json.gz)
        name of the file where data are stored, with each chunk of 1000
        articles appended
    resumptionToken : str (default None)
        token which instructs the OAI server to continue feeding the next
        chunk
    autoresume : bool
        If True, look for a saved resumptionToken in the file
        <outfile>-resumptionToken.txt
"""
    date = datetime.date.today().isoformat()
outfile = (
outfile or # user-supplied
find_default_locations() or # already in progress
os.path.join(
DIR_BASE, 'arxiv-metadata-oai-{}.json.gz'.format(date)
) # new file
)
directory = os.path.split(outfile)[0]
if directory and not os.path.exists(directory):
os.makedirs(directory)
tokenfile = '{}-resumptionToken.txt'.format(outfile)
chunk_index = 0
total_records = 0
log.info('Saving metadata to "{}"'.format(outfile))
    if resumptionToken is None and autoresume:
        try:
            with open(tokenfile, 'r') as fin:
                resumptionToken = fin.read()
        except OSError:
            log.warning("No tokenfile found '{}'".format(tokenfile))
            log.info("Starting download from scratch...")
while True:
log.info('Index {:4d} | Records {:7d} | resumptionToken "{}"'.format(
chunk_index, total_records, resumptionToken)
)
xml_root = ET.fromstring(get_list_record_chunk(resumptionToken))
check_xml_errors(xml_root)
records, resumptionToken = parse_xml_listrecords(xml_root)
        chunk_index += 1
        total_records += len(records)
with gzip.open(outfile, 'at', encoding='utf-8') as fout:
for rec in records:
fout.write(json.dumps(rec) + '\n')
if resumptionToken:
with open(tokenfile, 'w') as fout:
fout.write(resumptionToken)
else:
log.info('No resumption token, query finished')
return
time.sleep(12) # OAI server usually requires a 10s wait
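
# A hedged usage sketch (the explicit path is illustrative): each run saves
# its resumptionToken next to the output file, so an interrupted harvest
# resumes on the next call while autoresume is True:
#
#   all_of_arxiv()                            # new harvest, or resume via tokenfile
#   all_of_arxiv('data/arxiv-oai.json.gz')    # explicit output location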
def load_metadata(infile=None):
"""
    Load the metadata saved by all_of_arxiv, stored as gzip-compressed
    lines of JSON

    Parameters
    ----------
    infile : str or None
        name of the gzip file saved by all_of_arxiv. If None, the most
        recent save file in the default location (DIR_BASE) is used.

    Returns
    -------
    article_attributes : list
        list of dicts, each of which contains the metadata attributes of
        an arXiv article
"""
fname = infile or find_default_locations()
    with gzip.open(fname, 'rt', encoding='utf-8') as fin:
        return [json.loads(line) for line in fin]
def hash_abstracts(metadata):
    """ Replace abstracts with their MD5 hash for legal distribution """
    metadata_no_abstract = []
    for record in metadata:
        m = record.copy()
        m['abstract_md5'] = hashlib.md5(m['abstract'].encode()).hexdigest()
        del m['abstract']
        metadata_no_abstract.append(m)
    return metadata_no_abstract
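
# Sketch of round-tripping with the validator below (`metadata` as returned
# by load_metadata):
#
#   hashed = hash_abstracts(metadata)
#   assert validate_abstract_hashes(metadata, hashed)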
def validate_abstract_hashes(metadata, metadata_no_abstract):
    """ Validate that the abstracts match their stored hashes """
    return all(
        hashlib.md5(m['abstract'].encode()).hexdigest() == n['abstract_md5']
        for m, n in zip(metadata, metadata_no_abstract)
    )
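
if __name__ == '__main__':
    # Minimal CLI entry point matching the usage documented at the top of
    # this module (`python oai_metadata.py data/<savefile>.json.gz`); a
    # sketch, assuming the single optional argument is the output path.
    import sys
    OUTFILE = sys.argv[1] if len(sys.argv) > 1 else None
    all_of_arxiv(outfile=OUTFILE)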