Commit e1f16c8

enable crossref API queries; document folders
1 parent 7b641b9 commit e1f16c8

10 files changed (+314 −137 lines)

_docs/README.md (+1)

@@ -0,0 +1 @@
This directory contains input files for building the site with mkdocs.

_docs/usage.md (+29)

@@ -0,0 +1,29 @@
# Setup Twitter API

To reproduce these analyses or run Audiences on your own paper(s), you will first need to set up a Twitter developer account for access to the Twitter API. (Documentation for setting up a Twitter dev account is available [here](https://rtweet.info/articles/auth.html).) Once that is done, update the app name, consumer keys, and access keys in `config.yaml`.

# Generate reports

Running `render_reports.R` will generate a separate report for each of the papers listed by DOI in `papers.txt` (one per line). Reports are based on the `report_template.rmd` RMarkdown template.

Reports will be written to `output/reports`, and a thumbnail image for each report to `output/figures`.

# Build site

```
# generate data/items.toml, containing the links to the reports to include in the landing page
python generate_links.py

# build the landing page into _docs/static/ based on the hugrid Hugo theme in themes/hugrid/
# - requires config.toml and data/items.toml
hugo

# build the documentation with mkdocs into docs/
# - requires mkdocs.yml and the contents of _docs/
mkdocs build -d docs

# copy the reports & thumbnails into docs/static/
rsync -r output/ docs/static

# push the changes to GitHub and the documentation will be available at https://carjed.github.io/audiences/
```
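
The Twitter credentials described in `_docs/usage.md` above are read by the analysis scripts at run time. A minimal sketch of how the `config.yaml` fields might map onto an rtweet token (the access-key field names are assumptions not shown in the example config, and `create_token()` is the pre-1.0 rtweet interface):

```
# sketch: load the Twitter credentials from config.yaml and build an rtweet token
# NOTE: the access_token/access_secret field names are assumed, not confirmed by config.yaml
library(yaml)
library(rtweet)

cfg <- read_yaml("config.yaml")

token <- create_token(
  app             = cfg$app_name,
  consumer_key    = cfg$consumer_key,
  consumer_secret = cfg$consumer_secret,
  access_token    = cfg$access_token,
  access_secret   = cfg$access_secret
)
```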

altmetric_to_doi.R (+19)

@@ -0,0 +1,19 @@
# helper script for translating Altmetric URLs to DOIs

library(readr)   # read_tsv
library(rvest)   # read_html, html_nodes, html_text (also re-exports %>%)

article_urls <- read_tsv("audiences/papers_altmetric.txt", col_names = FALSE)

dois <- c()
for (article_full_url in article_urls$X1){

  # extract the numeric Altmetric id from the details URL
  article_id <- gsub(".*.details/", "", article_full_url)

  # scrape the DOI from the Altmetric details page
  summary_page <- read_html(article_full_url)
  doi <- summary_page %>%
    html_nodes("div.document-details-table tr:nth-child(3) :nth-child(1)") %>%
    html_text()

  dois <- c(dois, doi[2])
}

write.table(dois, "audiences/papers.txt", quote=F, row.names=F, col.names=F)

config.yaml (+1 −1)

@@ -1,5 +1,5 @@
 # example config file for Twitter API access
-
+email: "your email address"
 app_name: "name_of_twitter_app"
 consumer_key: "XXXXXXXXXXXXXXXXXX"
 consumer_secret: "XXXXXXXXXXXXXXXXXX"

content/README.md (+1)

@@ -0,0 +1 @@
This directory exists for building the landing page with Hugo.

crossref_scrape.py (+59)

@@ -0,0 +1,59 @@
# old version used to scrape data from the Crossref Events API
# deprecated in favor of the crevents R package to streamline
# analysis within the R scripts: https://github.com/ropensci/crevents

import requests
import re
import pandas as pd
import yaml

with open("papers.txt") as f:
    dois = f.read().splitlines()

# dois[1] = "10.1038/s41588-018-0147-3"

for doi in dois:

    parameters = {"mailto": "NA",
                  "obj-id": doi,
                  "source": "twitter",
                  "from-collected-date": "2012-01-01",
                  "rows": 2000}

    response = requests.get("https://api.eventdata.crossref.org/v1/events",
                            params=parameters)
    tweets = response.json()

    tw_url = "http://www.twitter.com/"

    df_cols = ["handles", "original", "tweets", "timestamps"]
    tweet_df = pd.DataFrame(columns=df_cols)
    for tweet in tweets["message"]["events"]:
        # strip the Twitter URL/URI prefixes to get the bare handle
        account_url = tweet["subj"]["author"]["url"]
        account = re.sub("http://www.twitter.com/", "", account_url)
        account = re.sub(r"twitter://user\?screen_name=", "", account)
        # print(account)
        original_author = tweet["subj"]["original-tweet-author"]
        if original_author is not None:
            original_author = re.sub("http://www.twitter.com/", "", original_author)
            original_author = re.sub(r"twitter://user\?screen_name=", "", original_author)

        # print(account)

        # rewrite the persistent id into a canonical tweet URL
        url = tweet["subj"]["pid"]
        url = re.sub(r"twitter://status\?id=",
                     "http://www.twitter.com/" + account + "/statuses/", url)
        date = tweet["subj"]["issued"]

        # row_df = pd.DataFrame({"account": account, "original": original_author, "tweets": url, "timestamps": date})
        # print(row_df)
        tweet_df = tweet_df.append({"handles": account,
                                    "original": original_author,
                                    "tweets": url,
                                    "timestamps": date},
                                   ignore_index=True)

    print("doi: " + doi, tweet_df.shape)

    tweet_df.to_csv("article_data/" + re.sub("/", "-", doi) + ".txt", sep="\t", index=False)
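
Since this commit retires the Python scraper in favor of querying Crossref from R, here is a rough R equivalent of the same Events API request using httr and jsonlite (the endpoint, parameters, and example DOI are taken from the script above; this is an illustration only, the pipeline itself relies on the crevents/rcrossref packages):

```
# illustrative R version of the Crossref Events query made by crossref_scrape.py
library(httr)
library(jsonlite)

doi <- "10.1038/s41588-018-0147-3"  # example DOI from papers.txt

resp <- GET("https://api.eventdata.crossref.org/v1/events",
            query = list("mailto" = "NA",
                         "obj-id" = doi,
                         "source" = "twitter",
                         "from-collected-date" = "2012-01-01",
                         "rows" = 2000))

# tweet-level event records for this DOI, as iterated over in the Python loop above
events <- fromJSON(content(resp, as = "text", encoding = "UTF-8"))$message$events
str(events, max.level = 1)
```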

data/README.md (+1)

@@ -0,0 +1 @@
This directory contains the `items.toml` file, output by `generate_links.py` and necessary for building the landing page with Hugo.

papers.txt (+30)

@@ -0,0 +1,30 @@
10.1016/j.cell.2018.02.031
10.1101/489401
10.1101/285734
10.1101/493882
10.1101/343087
10.1038/s41586-018-0455-x
10.1038/s41588-018-0147-3
10.1101/106203
10.1038/s41380-017-0005-1
10.1101/457515
10.1073/pnas.1612113114
10.1101/509315
10.1016/j.ajhg.2018.10.011
10.1038/s41588-018-0313-7
10.1038/s41586-019-0969-x
10.1038/s41588-018-0273-y
10.1101/556761
10.1126/science.aau4832
10.1016/j.cell.2018.09.008
10.1126/science.aag0776
10.1038/nature09103
10.1126/science.1243518
10.1126/science.aaa0114
10.1101/250191
10.1038/nature07331
10.1126/science.aan8433
10.1101/441261
10.1101/416610
10.1101/066431
10.1038/s41467-018-05257-7

render_reports.R (+29 −58)

@@ -7,6 +7,7 @@ library(markdown)
 library(rmarkdown)
 library(rAltmetric)
 library(rvest)
+library(rcrossref)
 library(tidyverse)
 library(yaml)
 library(anytime)
@@ -118,74 +119,43 @@ if(file.exists(training_data_full_fh)){
 }

 #-----------------------------------------------------------------------------
-# Define list of articles
+# Read list of article DOIs and Altmetric URLs
+# - in the future, this will be purely DOI-based
+# - can also pull in list of popular bioRxiv papers using the Rxivist API
 #-----------------------------------------------------------------------------
-article_urls <- c(
-  "https://cell.altmetric.com/details/34376150", # Browning et al (denisovan admixture) XX
-  "https://biorxiv.altmetric.com/details/52355933", # Ragsdale et al (ghost admixture) XX
-  "https://biorxiv.altmetric.com/details/34668368", # Durvasula et al (ghost admixture) XX
-  "https://biorxiv.altmetric.com/details/52608959", # Jensen et al (Stone Age chewing gum) XX
-  "https://biorxiv.altmetric.com/details/43592322", # Villanea & Schraiber (ghost admixture) XX
-  "https://www.altmetric.com/details/46833965", # Slon et al (Neanderthal/Denisovan offspring) XX
-  "https://www.altmetric.com/details/45430386", # Lee et al (educational attainment) XX
-  "https://biorxiv.altmetric.com/details/16179150", # Hill et al (family intelligence biorxiv version) XX
-  "https://www.altmetric.com/details/31492953", # Hill et al (family intelligence Mol Psych version) XX
-  "https://biorxiv.altmetric.com/details/50501527", # Abdellaoui et al (social strat~genetics) XX [nice UMAP structure; shows WN-affiliated psych/econ/philosophy/polisci]
-  "https://pnas.altmetric.com/details/15551866", # Kong et al (selection for educational attainment vars) XX [nice UMAP structure; shows WN-affiliated psych/econ/philosophy/polisci]
-  "https://biorxiv.altmetric.com/details/53351163", # MacLean et al (dog behavior) XX [example of non-human]
-  "https://cell.altmetric.com/details/49945431", # ASHG statement XX [heavily polarized]
-  "https://www.altmetric.com/details/53917059", # Lakhani et al (health insurance claims) XX
-  "https://www.altmetric.com/details/55811639", # Cao et al (organogenesis) XX
-  "https://www.altmetric.com/details/51402455", # Sherman et al (African pan genome) XX
-  "https://biorxiv.altmetric.com/details/55925121", # Bridavsky et al (Lil Bub genome) XX
-  "https://www.altmetric.com/details/49530141", # Erlich et al (DNA id) XX
-  "https://cell.altmetric.com/details/49530208", # Kim et al (DNA id 2) XX
-  "https://www.altmetric.com/details/12603108", # Field et al (human adaptation) XX
-  "https://www.altmetric.com/details/443774", # Behar et al (Jewish genetics) XX [virtually no sci clusters]
-  "https://www.altmetric.com/details/2118810", # Hellenthal et al (Global admixture) XX
-  "https://www.altmetric.com/details/2858415", # Seguin-Orlando et al (Euro history) XX
-  "https://biorxiv.altmetric.com/details/34262871", # Bycroft et al (Spanish pop struct) XX [good UMAP cosine structure]
-  "https://www.altmetric.com/details/115659", # Novembre et al (genes mirror geography) XX
-  "https://www.altmetric.com/details/27367975", # Crawford et al (skin pigmentation) XX [surprisingly few wn clusters]
-  "https://biorxiv.altmetric.com/details/49534330", # Martin et al (PRS risk) XX
-  "https://biorxiv.altmetric.com/details/48265719", # Albers & McVean (dating variants) XX
-  "https://biorxiv.altmetric.com/details/10104753", # Lawson et al (STRUCTURE tutorial biorxiv version) XX
-  "https://www.altmetric.com/details/46498440" # Lawson et al (STRUCTURE tutorial Nat Comm version) XX
-)
+dois <- scan(paste0(datadir, "/papers.txt"), what="", sep="\n")
+# article_urls <- scan(paste0(datadir, "/papers_altmetric.txt"), what="", sep="\n")

 #-----------------------------------------------------------------------------
 # Generate reports
 #-----------------------------------------------------------------------------
-for (article_full_url in article_urls){
+for (doi in dois){
+  # metadata <- cr_works(dois = doi)

-  # scrape doi and abstract
-  article_id <- gsub(".*.details/", "", article_full_url)
-
-  summary_page <- read_html(article_full_url)
-  doi <- summary_page %>%
-    html_nodes("div.document-details-table tr:nth-child(3) :nth-child(1)") %>%
-    html_text()
-
-  article_doi <- doi[2]
+  # altmetric metadata from API
+  article_am <- altmetrics(doi=doi)
+  article_df <- altmetric_data(article_am)
+  article_id <- article_df$altmetric_id

-  if(grepl("biorxiv", article_full_url)){
-    biorxiv_page <- read_html(paste0("https://www.biorxiv.org/content/", article_doi, "v1"))
-    abstract <- biorxiv_page %>%
-      html_nodes("div.abstract #p-2") %>%
-      html_text()
+  # get Altmetric URL, specifying journal-specific subdomain if needed
+  if(grepl("10.1101", doi)){
+    subdomain <- "biorxiv"
+  } else if(grepl("10.1016", doi)){
+    subdomain <- "cell"
   } else {
-    abstract1 <- summary_page %>%
-      html_nodes("div.content-wrapper tr:nth-child(6) :nth-child(1)") %>%
-      html_text()
-    abstract <- gsub("\n", "", abstract1[2])
+    subdomain <- "www"
   }

-  # altmetric metadata from API
-  article_am <- altmetrics(doi=article_doi)
-  article_df <- altmetric_data(article_am)
+  article_full_url <- paste0("https://", subdomain, ".altmetric.com/details/", article_id)
+
+  # get abstract if it's available in Crossref metadata
+  abstract <- try(cr_abstract(doi))
+  if(inherits(abstract, "try-error")){
+    abstract <- ""
+  }

   nb_file <- gsub(" ", "_", paste0(gsub(",.*", "", article_df$authors1), " et al-",
-    gsub("\"|/", "", article_df$title), "_", article_id, ".html"))
+                                   gsub("\"|/", "", article_df$title), "_", article_id, ".html"))
   nb_title <- paste0("Twitter Audience Analysis of '", article_df$title,
     "' by ", gsub(",.*", "", article_df$authors1), " et al., published in ",
     article_df$journal, " on ", anydate(as.integer(article_df$added_on)))
@@ -197,6 +167,7 @@ for (article_full_url in article_urls){
     output_dir = paste0(datadir, "/output/reports"),
     params = list(title = nb_title,
                   abstract = abstract,
-                  doi = article_doi))
+                  datadir = datadir,
+                  doi = doi))

-}
+}
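
A quick way to sanity-check the new DOI-driven lookups is to run the same rAltmetric/rcrossref calls interactively for a single entry of `papers.txt`; a sketch (the example DOI is the first line of `papers.txt`):

```
# sketch: fetch metadata for one DOI the same way render_reports.R now does
library(rAltmetric)
library(rcrossref)

doi <- "10.1016/j.cell.2018.02.031"  # first entry of papers.txt

article_df <- altmetric_data(altmetrics(doi = doi))
article_df$title         # article title from Altmetric
article_df$altmetric_id  # id used to build the Altmetric details URL

# abstract lookup mirrors the try() guard above; not all Crossref records include one
abstract <- try(cr_abstract(doi))
if(inherits(abstract, "try-error")){
  abstract <- ""
}
```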
