Imports and Login
import dimcli
import pandas as pd
from clj import mapcat
from funcy import chunks
import json
# Open a Dimensions API session and create the DSL client used for queries.
# NOTE(review): login() is called without arguments, so credentials are
# presumably read from dimcli's local configuration - confirm.
dimcli.login()
dsl = dimcli.Dsl()
Adds information from different categorical metadata sources, such as research categories and classification schemes:
Cf. this page for short descriptions of all categorisation systems listed above.
import dimcli
import pandas as pd
from clj import mapcat
from funcy import chunks
import json
# Open a Dimensions API session and create the DSL client used for queries.
dimcli.login()
dsl = dimcli.Dsl()

# Load the preprints table and keep every non-missing DOI.
# NOTE(review): INPUT_FILE is not defined in this cell; it must be set
# before this cell runs.
preprints_df = pd.read_csv(INPUT_FILE)
dois = list(preprints_df.dropna(subset=["doi"]).doi.values)

# Number of DOIs sent to the API per query.
CHUNK_SIZE = 300
data = []

q = "search publications where doi in {} return publications [id + doi + title + year + journal + categories]"
# The result limit is twice the chunk size.
# NOTE(review): presumably headroom for DOIs that match more than one
# publication record - confirm.
q = q + f" limit {CHUNK_SIZE*2}"

for dois_chunk in chunks(CHUNK_SIZE, dois):
    # json.dumps renders the chunk as a double-quoted list literal that is
    # substituted into the `doi in {}` placeholder.
    results = dsl.query(q.format(json.dumps(dois_chunk)))
    data.append(results)

# Flatten the per-chunk "publications" lists into one row per publication.
categories_df = pd.DataFrame(mapcat(lambda x: x["publications"], data))
categories_df.head()
category_bra | category_for | category_hra | category_hrcs_rac | category_rcdc | category_sdg | category_uoa | doi | id | journal | title | year | category_hrcs_hc | category_icrp_cso | category_icrp_ct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [{'id': '4001', 'name': 'Clinical Medicine and... | [{'id': '80003', 'name': '32 Biomedical and Cl... | [{'id': '3901', 'name': 'Clinical'}] | [{'id': '10501', 'name': '5.1 Pharmaceuticals'}] | [{'id': '533', 'name': 'Infectious Diseases'},... | [{'id': '40003', 'name': '3 Good Health and We... | [{'id': '30001', 'name': 'A01 Clinical Medicin... | 10.21203/rs.3.rs-137853/v1 | pub.1139066291 | {'id': 'jour.1380788', 'title': 'Research Squa... | The kidnapping of mitochondrial function assoc... | 2020 | NaN | NaN | NaN |
1 | [{'id': '4000', 'name': 'Basic Science'}] | [{'id': '80002', 'name': '31 Biological Scienc... | [{'id': '3900', 'name': 'Biomedical'}] | [{'id': '10202', 'name': '2.2 Factors relating... | [{'id': '558', 'name': 'Prevention'}, {'id': '... | [{'id': '40003', 'name': '3 Good Health and We... | [{'id': '30005', 'name': 'A05 Biological Scien... | 10.14293/s2199-1006.1.sor-.ppdo2zu.v1 | pub.1134469293 | {'id': 'jour.1381049', 'title': 'ScienceOpen P... | Hypothesis of Potential Evolution of SARS-CoV-... | 2020 | [{'id': '898', 'name': 'Infection'}] | NaN | NaN |
2 | NaN | [{'id': '80138', 'name': '4203 Health Services... | [{'id': '3903', 'name': 'Population & Society'}] | NaN | [{'id': '546', 'name': 'Patient Safety'}, {'id... | NaN | [{'id': '30002', 'name': 'A02 Public Health, H... | 10.48550/arxiv.2101.00060 | pub.1134363910 | {'id': 'jour.1371339', 'title': 'arXiv'} | Networks of Necessity: Simulating COVID-19 Mit... | 2020 | [{'id': '898', 'name': 'Infection'}] | NaN | NaN |
3 | [{'id': '4003', 'name': 'Public Health'}] | [{'id': '80017', 'name': '46 Information and C... | [{'id': '3903', 'name': 'Population & Society'}] | NaN | [{'id': '380', 'name': 'Mental Health'}, {'id'... | [{'id': '40003', 'name': '3 Good Health and We... | [{'id': '30003', 'name': 'A03 Allied Health Pr... | 10.2196/preprints.26876 | pub.1134331013 | {'id': 'jour.1345647', 'title': 'JMIR Preprints'} | Twitter Users Display Desensitization to Bad H... | 2020 | [{'id': '905', 'name': 'Mental health'}] | NaN | NaN |
4 | NaN | [{'id': '80138', 'name': '4203 Health Services... | [{'id': '3902', 'name': 'Health services & sys... | [{'id': '10304', 'name': '3.4 Vaccines'}] | [{'id': '531', 'name': 'Immunization'}, {'id':... | [{'id': '40003', 'name': '3 Good Health and We... | [{'id': '30003', 'name': 'A03 Allied Health Pr... | 10.2196/preprints.26874 | pub.1134331012 | {'id': 'jour.1345647', 'title': 'JMIR Preprints'} | COVID-19 Vaccine Hesitancy in Canada: Content ... | 2020 | NaN | NaN | NaN |
… one category per row
# Reshape categories_df from one row per publication (each category_* column
# holding a list of {"id", "name"} dicts, or NaN) to one row per
# (publication, category) pair.
category_cols = ["category_"+x for x in ["bra", "for", "hra", "rcdc", "uoa", "hrcs_hc", "hrcs_rac", "sdg", "icrp_ct", "icrp_cso"]]
exploded_categories_df = (
    categories_df
    # filter() keeps only the listed columns that actually exist in the frame.
    .filter(["id", "doi"] + category_cols)
    # Wide -> long: one row per (publication, classification scheme).
    .melt(id_vars=["id", "doi"], value_vars=category_cols, var_name="category_label")
    # One row per individual category dict inside each list.
    .explode(column="value")
    # Drop publications that have no category for a given scheme; this must
    # happen before the dict lookups below, which would fail on NaN.
    .dropna(subset=["value"])
    .assign(
        category_id=lambda df: df.value.apply(lambda v: v["id"]),
        category_name=lambda df: df.value.apply(lambda v: v["name"]),
    )
    .drop(columns="value")
)
exploded_categories_df.head()
id | doi | category_label | category_id | category_name | |
---|---|---|---|---|---|
0 | pub.1139066291 | 10.21203/rs.3.rs-137853/v1 | category_bra | 4001 | Clinical Medicine and Science |
1 | pub.1134469293 | 10.14293/s2199-1006.1.sor-.ppdo2zu.v1 | category_bra | 4000 | Basic Science |
3 | pub.1134331013 | 10.2196/preprints.26876 | category_bra | 4003 | Public Health |
7 | pub.1134300967 | 10.20944/preprints202012.0780.v1 | category_bra | 4003 | Public Health |
15 | pub.1134263985 | 10.31234/osf.io/eb6yz | category_bra | 4003 | Public Health |
# Write the one-category-per-row table to CSV without the DataFrame index.
# index=False is the documented boolean form; the original index=None is
# falsy and had the same effect.
# NOTE(review): OUTPUT_FILE is not defined in this cell; it must be set
# before this cell runs.
exploded_categories_df.to_csv(OUTPUT_FILE, index=False)