Imports and Login
import pandas as pd
import requests
import time
from funcy import chunks
OpenAlex is
An open and comprehensive catalog of scholarly papers, authors, institutions, and more.
Inspired by the ancient Library of Alexandria, OpenAlex is an index of hundreds of millions of interconnected entities across the global research system. We’re 100% free and open source, and offer access via a web interface.
It also offers an API and snapshots of the full database.
According to their FAQ, OpenAlex attempts to disambiguate authors:
Do you disambiguate authors?
Yes. Using coauthors, references, and other features of the data, we can tell that the same Jane Smith wrote both “Frog behavior” and “Frogs: A retrospective,” but it’s a different Jane Smith who wrote “Oats before boats: The breakfast customs of 17th-Century Dutch bargemen.”
This makes OpenAlex an interesting resource for disambiguated author names.
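Given a DOI from our set of COVID-19-related preprints, we will try to get the corresponding work's authorships, open-access information, and retraction status from OpenAlex.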
import pandas as pd
import requests
import time
from funcy import chunks
# Harvest OpenAlex metadata for every DOI listed in INPUT_FILE.
#
# For each work returned by the OpenAlex /works endpoint, three tables are
# collected and finally written to CSV:
#   * authorships  -> OUTPUT_FILE_AUTHORS
#   * open access  -> OUTPUT_FILE_OPEN_ACCESS
#   * retraction   -> OUTPUT_FILE_RETRACTIONS
# Every table carries a "doi" column copied from the OpenAlex work record.

# Number of DOIs OR-ed together in a single request. It equals the "per-page"
# value in `params` below, so one result page has room for one hit per DOI.
CHUNK_SIZE = 50

# Seconds to wait for OpenAlex before giving up on a chunk; without a timeout
# a stalled connection would block the whole run.
REQUEST_TIMEOUT = 30


def _concat_or_empty(frames, columns):
    """Concatenate *frames*, or return an empty frame with *columns* if none.

    ``pd.concat`` raises ``ValueError`` on an empty list, which would abort
    the script when OpenAlex returned no works at all.
    """
    if frames:
        return pd.concat(frames)
    return pd.DataFrame(columns=columns)


# Load the input table and keep only the rows that actually have a DOI.
df = pd.read_csv(INPUT_FILE)
dois = df.dropna(subset=["doi"])["doi"].values

params = {"mailto":"meik.bittkowski@sciencemediacenter.de",
          "per-page":"50"}

# One DataFrame per work is appended to each list; they are merged at the end.
authorship_data = []
open_access_data = []
retraction_data = []

for count, doi_chunk in enumerate(chunks(CHUNK_SIZE, dois), start=1):
    if count % 10 == 0:
        print(f"processed {count*CHUNK_SIZE} / {len(dois)} ({count/len(dois)*CHUNK_SIZE*100:.2f} %)")

    # "|" joins several values of one filter; the f-string also tolerates
    # DOIs that pandas parsed as something other than str.
    doi_filter = "|".join(f"https://doi.org/{doi}" for doi in doi_chunk)

    try:
        # Passing the filter through `params` lets requests URL-encode it, so
        # DOIs containing reserved characters cannot corrupt the query string.
        r = requests.get(
            "https://api.openalex.org/works",
            params={**params, "filter": f"doi:{doi_filter}"},
            timeout=REQUEST_TIMEOUT,
        )
        # Turn HTTP error statuses into exceptions so that a failed chunk is
        # reported below instead of being dropped silently.
        r.raise_for_status()
        payload = r.json()
    except (requests.RequestException, ValueError) as err:
        # Best effort: report the failed chunk and carry on with the next one.
        # KeyboardInterrupt / SystemExit are deliberately NOT caught here.
        print(f"Unexpected {err=}, {type(err)=}, {count=}, {doi_chunk=}")
        continue

    print(payload["meta"])
    works = payload["results"]
    for work in works:
        # One row per author of the work.
        authorships_df = pd.json_normalize(work, record_path="authorships")
        authorships_df["doi"] = work["doi"]
        authorship_data.append(authorships_df)

        # Flattened "open_access" record of the work.
        open_access_df = pd.json_normalize(work["open_access"])
        open_access_df["doi"] = work["doi"]
        open_access_data.append(open_access_df)

        is_retracted_df = pd.DataFrame([{"doi":work["doi"], "is_retracted":work["is_retracted"]}])
        retraction_data.append(is_retracted_df)

oa_authorships = _concat_or_empty(authorship_data, ["doi"])
oa_open_access = _concat_or_empty(open_access_data, ["doi"])
oa_is_retracted = _concat_or_empty(retraction_data, ["doi", "is_retracted"])

# The row index carries no information (it restarts for every work), so it is
# left out of the CSV files.
oa_authorships.to_csv(OUTPUT_FILE_AUTHORS, index=False)
oa_open_access.to_csv(OUTPUT_FILE_OPEN_ACCESS, index=False)
oa_is_retracted.to_csv(OUTPUT_FILE_RETRACTIONS, index=False)