Commit e1f16c8

enable crossref API queries; document folders
1 parent 7b641b9 commit e1f16c8

10 files changed (+314 −137 lines)

_docs/README.md (+1)

@@ -0,0 +1 @@
This directory contains input files for building the site with mkdocs.

_docs/usage.md (+29)

@@ -0,0 +1,29 @@
# Setup Twitter API

To reproduce these analyses or run Audiences on your own paper(s), you will first need to set up a Twitter developer account for access to the Twitter API. (Documentation for setting up a Twitter dev account is available [here](https://rtweet.info/articles/auth.html).) Once that is done, update the app name, consumer keys, and access keys in `config.yaml`.

# Generate reports

Running `render_reports.R` will generate a separate report for each of the papers listed by DOI in `papers.txt` (one per line). Reports are based on the `report_template.rmd` RMarkdown template.

Reports will be written to `output/reports`, and a thumbnail image for each report to `output/figures`.

# Build site

```
# generate data/items.toml, containing the links to the reports to include in the landing page
python generate_links.py

# build the landing page into _docs/static/ based on the hugrid Hugo theme in themes/hugrid/
# - requires config.toml and data/items.toml
hugo

# build the documentation with mkdocs into docs/
# - requires mkdocs.yml and the contents of _docs/
mkdocs build -d docs

# copy the reports & thumbnails into docs/static/
rsync -r output/ docs/static

# push the changes to GitHub and the documentation will be available at https://carjed.github.io/audiences/
```
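
The Twitter credentials described in `_docs/usage.md` above are read by the analysis scripts at run time. A minimal sketch of how the `config.yaml` fields might map onto an rtweet token (the access-key field names are assumptions not shown in the example config, and `create_token()` is the pre-1.0 rtweet interface):

```
# sketch: load the Twitter credentials from config.yaml and build an rtweet token
# NOTE: the access_token/access_secret field names are assumed, not confirmed by config.yaml
library(yaml)
library(rtweet)

cfg <- read_yaml("config.yaml")

token <- create_token(
  app             = cfg$app_name,
  consumer_key    = cfg$consumer_key,
  consumer_secret = cfg$consumer_secret,
  access_token    = cfg$access_token,
  access_secret   = cfg$access_secret
)
```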

altmetric_to_doi.R (+19)

@@ -0,0 +1,19 @@
# helper script for translating Altmetric URLs to DOIs

library(readr)   # read_tsv
library(rvest)   # read_html, html_nodes, html_text (also re-exports %>%)

article_urls <- read_tsv("audiences/papers_altmetric.txt", col_names = FALSE)

dois <- c()
for (article_full_url in article_urls$X1){

  # extract the numeric Altmetric id from the details URL
  article_id <- gsub(".*.details/", "", article_full_url)

  # scrape the DOI from the Altmetric details page
  summary_page <- read_html(article_full_url)
  doi <- summary_page %>%
    html_nodes("div.document-details-table tr:nth-child(3) :nth-child(1)") %>%
    html_text()

  dois <- c(dois, doi[2])
}

write.table(dois, "audiences/papers.txt", quote=F, row.names=F, col.names=F)

config.yaml (+1 −1)

@@ -1,5 +1,5 @@
 # example config file for Twitter API access
-
+email: "your email address"
 app_name: "name_of_twitter_app"
 consumer_key: "XXXXXXXXXXXXXXXXXX"
 consumer_secret: "XXXXXXXXXXXXXXXXXX"

content/README.md (+1)

@@ -0,0 +1 @@
This directory exists for building the landing page with Hugo.

crossref_scrape.py (+59)

@@ -0,0 +1,59 @@
# old version used to scrape data from the Crossref Events API
# deprecated in favor of the crevents R package to streamline
# analysis within the R scripts: https://github.com/ropensci/crevents

import requests
import re
import pandas as pd
import yaml

with open("papers.txt") as f:
    dois = f.read().splitlines()

# dois[1] = "10.1038/s41588-018-0147-3"

for doi in dois:

    parameters = {"mailto": "NA",
                  "obj-id": doi,
                  "source": "twitter",
                  "from-collected-date": "2012-01-01",
                  "rows": 2000}

    response = requests.get("https://api.eventdata.crossref.org/v1/events",
                            params=parameters)
    tweets = response.json()

    tw_url = "http://www.twitter.com/"

    df_cols = ["handles", "original", "tweets", "timestamps"]
    tweet_df = pd.DataFrame(columns=df_cols)
    for tweet in tweets["message"]["events"]:
        # strip the Twitter URL/URI prefixes to get the bare handle
        account_url = tweet["subj"]["author"]["url"]
        account = re.sub("http://www.twitter.com/", "", account_url)
        account = re.sub(r"twitter://user\?screen_name=", "", account)
        # print(account)
        original_author = tweet["subj"]["original-tweet-author"]
        if original_author is not None:
            original_author = re.sub("http://www.twitter.com/", "", original_author)
            original_author = re.sub(r"twitter://user\?screen_name=", "", original_author)

        # print(account)

        # rewrite the persistent id into a canonical tweet URL
        url = tweet["subj"]["pid"]
        url = re.sub(r"twitter://status\?id=",
                     "http://www.twitter.com/" + account + "/statuses/", url)
        date = tweet["subj"]["issued"]

        # row_df = pd.DataFrame({"account": account, "original": original_author, "tweets": url, "timestamps": date})
        # print(row_df)
        tweet_df = tweet_df.append({"handles": account,
                                    "original": original_author,
                                    "tweets": url,
                                    "timestamps": date},
                                   ignore_index=True)

    print("doi: " + doi, tweet_df.shape)

    tweet_df.to_csv("article_data/" + re.sub("/", "-", doi) + ".txt", sep="\t", index=False)
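
Since this commit retires the Python scraper in favor of querying Crossref from R, here is a rough R equivalent of the same Events API request using httr and jsonlite (the endpoint, parameters, and example DOI are taken from the script above; this is an illustration only, the pipeline itself relies on the crevents/rcrossref packages):

```
# illustrative R version of the Crossref Events query made by crossref_scrape.py
library(httr)
library(jsonlite)

doi <- "10.1038/s41588-018-0147-3"  # example DOI from papers.txt

resp <- GET("https://api.eventdata.crossref.org/v1/events",
            query = list("mailto" = "NA",
                         "obj-id" = doi,
                         "source" = "twitter",
                         "from-collected-date" = "2012-01-01",
                         "rows" = 2000))

# tweet-level event records for this DOI, as iterated over in the Python loop above
events <- fromJSON(content(resp, as = "text", encoding = "UTF-8"))$message$events
str(events, max.level = 1)
```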

data/README.md (+1)

@@ -0,0 +1 @@
This directory contains the `items.toml` file, output by `generate_links.py` and necessary for building the landing page with Hugo.

papers.txt (+30)

@@ -0,0 +1,30 @@
10.1016/j.cell.2018.02.031
10.1101/489401
10.1101/285734
10.1101/493882
10.1101/343087
10.1038/s41586-018-0455-x
10.1038/s41588-018-0147-3
10.1101/106203
10.1038/s41380-017-0005-1
10.1101/457515
10.1073/pnas.1612113114
10.1101/509315
10.1016/j.ajhg.2018.10.011
10.1038/s41588-018-0313-7
10.1038/s41586-019-0969-x
10.1038/s41588-018-0273-y
10.1101/556761
10.1126/science.aau4832
10.1016/j.cell.2018.09.008
10.1126/science.aag0776
10.1038/nature09103
10.1126/science.1243518
10.1126/science.aaa0114
10.1101/250191
10.1038/nature07331
10.1126/science.aan8433
10.1101/441261
10.1101/416610
10.1101/066431
10.1038/s41467-018-05257-7

render_reports.R (+29 −58)

@@ -7,6 +7,7 @@ library(markdown)
 library(rmarkdown)
 library(rAltmetric)
 library(rvest)
+library(rcrossref)
 library(tidyverse)
 library(yaml)
 library(anytime)
@@ -118,74 +119,43 @@ if(file.exists(training_data_full_fh)){
 }

 #-----------------------------------------------------------------------------
-# Define list of articles
+# Read list of article DOIs and Altmetric URLs
+# - in the future, this will be purely DOI-based
+# - can also pull in list of popular bioRxiv papers using the Rxivist API
 #-----------------------------------------------------------------------------
-article_urls <- c(
-  "https://cell.altmetric.com/details/34376150", # Browning et al (denisovan admixture) XX
-  "https://biorxiv.altmetric.com/details/52355933", # Ragsdale et al (ghost admixture) XX
-  "https://biorxiv.altmetric.com/details/34668368", # Durvasula et al (ghost admixture) XX
-  "https://biorxiv.altmetric.com/details/52608959", # Jensen et al (Stone Age chewing gum) XX
-  "https://biorxiv.altmetric.com/details/43592322", # Villanea & Schraiber (ghost admixture) XX
-  "https://www.altmetric.com/details/46833965", # Slon et al (Neanderthal/Denisovan offspring) XX
-  "https://www.altmetric.com/details/45430386", # Lee et al (educational attainment) XX
-  "https://biorxiv.altmetric.com/details/16179150", # Hill et al (family intelligence biorxiv version) XX
-  "https://www.altmetric.com/details/31492953", # Hill et al (family intelligence Mol Psych version) XX
-  "https://biorxiv.altmetric.com/details/50501527", # Abdellaoui et al (social strat~genetics) XX [nice UMAP structure; shows WN-affiliated psych/econ/philosophy/polisci]
-  "https://pnas.altmetric.com/details/15551866", # Kong et al (selection for educational attainment vars) XX [nice UMAP structure; shows WN-affiliated psych/econ/philosophy/polisci]
-  "https://biorxiv.altmetric.com/details/53351163", # MacLean et al (dog behavior) XX [example of non-human]
-  "https://cell.altmetric.com/details/49945431", # ASHG statement XX [heavily polarized]
-  "https://www.altmetric.com/details/53917059", # Lakhani et al (health insurance claims) XX
-  "https://www.altmetric.com/details/55811639", # Cao et al (organogenesis) XX
-  "https://www.altmetric.com/details/51402455", # Sherman et al (African pan genome) XX
-  "https://biorxiv.altmetric.com/details/55925121", # Bridavsky et al (Lil Bub genome) XX
-  "https://www.altmetric.com/details/49530141", # Erlich et al (DNA id) XX
-  "https://cell.altmetric.com/details/49530208", # Kim et al (DNA id 2) XX
-  "https://www.altmetric.com/details/12603108", # Field et al (human adaptation) XX
-  "https://www.altmetric.com/details/443774", # Behar et al (Jewish genetics) XX [virtually no sci clusters]
-  "https://www.altmetric.com/details/2118810", # Hellenthal et al (Global admixture) XX
-  "https://www.altmetric.com/details/2858415", # Seguin-Orlando et al (Euro history) XX
-  "https://biorxiv.altmetric.com/details/34262871", # Bycroft et al (Spanish pop struct) XX [good UMAP cosine structure]
-  "https://www.altmetric.com/details/115659", # Novembre et al (genes mirror geography) XX
-  "https://www.altmetric.com/details/27367975", # Crawford et al (skin pigmentation) XX [surprisingly few wn clusters]
-  "https://biorxiv.altmetric.com/details/49534330", # Martin et al (PRS risk) XX
-  "https://biorxiv.altmetric.com/details/48265719", # Albers & McVean (dating variants) XX
-  "https://biorxiv.altmetric.com/details/10104753", # Lawson et al (STRUCTURE tutorial biorxiv version) XX
-  "https://www.altmetric.com/details/46498440" # Lawson et al (STRUCTURE tutorial Nat Comm version) XX
-)
+dois <- scan(paste0(datadir, "/papers.txt"), what="", sep="\n")
+# article_urls <- scan(paste0(datadir, "/papers_altmetric.txt"), what="", sep="\n")

 #-----------------------------------------------------------------------------
 # Generate reports
 #-----------------------------------------------------------------------------
-for (article_full_url in article_urls){
+for (doi in dois){
+  # metadata <- cr_works(dois = doi)

-  # scrape doi and abstract
-  article_id <- gsub(".*.details/", "", article_full_url)
-
-  summary_page <- read_html(article_full_url)
-  doi <- summary_page %>%
-    html_nodes("div.document-details-table tr:nth-child(3) :nth-child(1)") %>%
-    html_text()
-
-  article_doi <- doi[2]
+  # altmetric metadata from API
+  article_am <- altmetrics(doi=doi)
+  article_df <- altmetric_data(article_am)
+  article_id <- article_df$altmetric_id

-  if(grepl("biorxiv", article_full_url)){
-    biorxiv_page <- read_html(paste0("https://www.biorxiv.org/content/", article_doi, "v1"))
-    abstract <- biorxiv_page %>%
-      html_nodes("div.abstract #p-2") %>%
-      html_text()
+  # get Altmetric URL, specifying journal-specific subdomain if needed
+  if(grepl("10.1101", doi)){
+    subdomain <- "biorxiv"
+  } else if(grepl("10.1016", doi)){
+    subdomain <- "cell"
   } else {
-    abstract1 <- summary_page %>%
-      html_nodes("div.content-wrapper tr:nth-child(6) :nth-child(1)") %>%
-      html_text()
-    abstract <- gsub("\n", "", abstract1[2])
+    subdomain <- "www"
   }

-  # altmetric metadata from API
-  article_am <- altmetrics(doi=article_doi)
-  article_df <- altmetric_data(article_am)
+  article_full_url <- paste0("https://", subdomain, ".altmetric.com/details/", article_id)
+
+  # get abstract if it's available in Crossref metadata
+  abstract <- try(cr_abstract(doi))
+  if(inherits(abstract, "try-error")){
+    abstract <- ""
+  }

   nb_file <- gsub(" ", "_", paste0(gsub(",.*", "", article_df$authors1), " et al-",
-    gsub("\"|/", "", article_df$title), "_", article_id, ".html"))
+                                   gsub("\"|/", "", article_df$title), "_", article_id, ".html"))
   nb_title <- paste0("Twitter Audience Analysis of '", article_df$title,
     "' by ", gsub(",.*", "", article_df$authors1), " et al., published in ",
     article_df$journal, " on ", anydate(as.integer(article_df$added_on)))
@@ -197,6 +167,7 @@ for (article_full_url in article_urls){
     output_dir = paste0(datadir, "/output/reports"),
     params = list(title = nb_title,
                   abstract = abstract,
-                  doi = article_doi))
+                  datadir = datadir,
+                  doi = doi))

-}
+}
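
A quick way to sanity-check the new DOI-driven lookups is to run the same rAltmetric/rcrossref calls interactively for a single entry of `papers.txt`; a sketch (the example DOI is the first line of `papers.txt`):

```
# sketch: fetch metadata for one DOI the same way render_reports.R now does
library(rAltmetric)
library(rcrossref)

doi <- "10.1016/j.cell.2018.02.031"  # first entry of papers.txt

article_df <- altmetric_data(altmetrics(doi = doi))
article_df$title         # article title from Altmetric
article_df$altmetric_id  # id used to build the Altmetric details URL

# abstract lookup mirrors the try() guard above; not all Crossref records include one
abstract <- try(cr_abstract(doi))
if(inherits(abstract, "try-error")){
  abstract <- ""
}
```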
