Imports and Login
import pandas as pd
import requests
import os
from grobid_client.grobid_client import GrobidClient
from slugify import slugify
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool
The preprint dataset queried from Dimensions contains references to the full texts of preprints if available.
To process the full texts, we first download all available PDF files in parallel. Then we use GROBID to parse those PDF files into a standardised XML representation, i.e. XML-TEI.
GROBID needs to be installed separately (see this guide for a Docker deployment). Configuration options for the GROBID-Client are stored in a JSON file like the one at ../cfg/grobid_cfg.json.
import pandas as pd
import requests
import os
from grobid_client.grobid_client import GrobidClient
from slugify import slugify
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool
# Load the preprint metadata from the CSV at INPUT_FILE_PREPRINTS.
df = pd.read_csv(INPUT_FILE_PREPRINTS)
# Count rows with (False) and without (True) a value in the `linkout` column.
df.linkout.isna().value_counts()
False 31773
True 11912
Name: linkout, dtype: int64
# cf. https://opensourceoptions.com/blog/use-python-to-download-multiple-files-or-urls-in-parallel/
def download_url(args, timeout=60):
    """Fetch one URL and store the response on disk if it is a PDF.

    Args:
        args: A ``(url, filename)`` pair, packed into a single argument so
            the function can be mapped over an iterable by a worker pool.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``;
            without one a stalled server would block the worker forever.

    Returns:
        ``"<url> is PDF"`` when the response was written to ``filename``,
        the bare ``url`` when the response is not a PDF, and ``None`` when
        the download raised an exception (the exception is printed).
    """
    url, fn = args[0], args[1]
    try:
        r = requests.get(url, timeout=timeout)
        # The Content-Type header may be absent; treat that as "not a PDF"
        # instead of failing on `in None`. Media types are case-insensitive.
        content_type = (r.headers.get('content-type') or '').lower()
        if 'application/pdf' in content_type:
            with open(fn, 'wb') as f:
                f.write(r.content)
            return f"{url} is PDF"
        return url
    except Exception as e:
        # Best effort: one failing URL must not abort the whole batch.
        print('Exception in download_url():', e)
        return None
def download_parallel(args):
    """Run ``download_url`` over ``args`` in a thread pool and print results.

    Args:
        args: Iterable of ``(url, filename)`` pairs, each passed unchanged
            to ``download_url``.

    Results are printed in completion order, not input order, because
    ``imap_unordered`` yields each result as soon as it is ready.
    """
    # Leave one core free, but never ask for an empty pool: ThreadPool
    # raises ValueError when given fewer than one worker.
    workers = max(1, cpu_count() - 1)
    # The context manager shuts the pool down once all results are consumed.
    with ThreadPool(workers) as pool:
        for url in pool.imap_unordered(download_url, args):
            print('url:', url)
# Blank out REQUESTS_CA_BUNDLE for the download step
# (presumably so requests falls back to its default certificates -- confirm).
os.environ["REQUESTS_CA_BUNDLE"] = ""

# Target path for each preprint's PDF, derived from its slugified DOI.
# Rows without a DOI get no filename (NaN) and are dropped below.
df["filename"] = df.doi.dropna().apply(lambda x: f"{OUTPUT_PATH_FULLTEXTS_RAW}/{slugify(x)}.pdf")

# (linkout, filename) tuples for every row that has both values.
download_inputs = list(df[["linkout", "filename"]].dropna().itertuples(index=False, name=None))

# In test mode only the first TEST_N downloads are attempted.
if TEST:
    download_inputs = download_inputs[:TEST_N]

download_parallel(download_inputs)
# Adds additional information about certificates if necessary: take the CA
# bundle path from REQUESTS_CA_BUNDLE_EXTRA, or leave it empty when unset.
os.environ["REQUESTS_CA_BUNDLE"] = os.environ.get("REQUESTS_CA_BUNDLE_EXTRA", "")

# Run the "processFulltextDocument" service over the raw-PDF directory and
# write the results to OUTPUT_PATH_FULLTEXTS_PARSED.
client = GrobidClient(config_path=PATH_TO_GROBID_CONFIG)
client.process(
    "processFulltextDocument",
    OUTPUT_PATH_FULLTEXTS_RAW,
    output=OUTPUT_PATH_FULLTEXTS_PARSED,
    n=20,
)