Name: Update 2022-12-28 for preprints posted up until 2022-12-25 · nicholasmfraser/covid19_preprints@1d5832c · GitHub
Rating: 4.5 (6328 reviews)
nicholasmfraser
diff --git a/‎covid19_preprints.Rmd
+2-5 b/‎covid19_preprints.Rmd
+2-5
diff --git a/‎covid19_preprints.md
+26-8 b/‎covid19_preprints.md
+26-8
diff --git a/‎covid19_preprints_files/figure-gfm/unnamed-chunk-20-1.png
-237 Bytes b/‎covid19_preprints_files/figure-gfm/unnamed-chunk-20-1.png
-237 Bytes
diff --git a/‎covid19_preprints_files/figure-gfm/unnamed-chunk-21-1.png
422 Bytes b/‎covid19_preprints_files/figure-gfm/unnamed-chunk-21-1.png
422 Bytes
diff --git a/‎covid19_preprints_files/figure-gfm/unnamed-chunk-22-1.png
187 Bytes b/‎covid19_preprints_files/figure-gfm/unnamed-chunk-22-1.png
187 Bytes
diff --git a/‎covid19_preprints_files/figure-gfm/unnamed-chunk-23-1.png
413 Bytes b/‎covid19_preprints_files/figure-gfm/unnamed-chunk-23-1.png
413 Bytes
diff --git a/‎covid19_preprints_files/figure-gfm/unnamed-chunk-24-1.png
140 Bytes b/‎covid19_preprints_files/figure-gfm/unnamed-chunk-24-1.png
140 Bytes
diff --git a/‎data/covid19_preprints.csv
+2-2 b/‎data/covid19_preprints.csv
+2-2
diff --git a/‎data/metadata.json
+3-3 b/‎data/metadata.json
+3-3
diff --git a/‎outputs/figures/covid19_preprints_day.png
7.88 KB b/‎outputs/figures/covid19_preprints_day.png
7.88 KB
diff --git a/‎outputs/figures/covid19_preprints_day_cumulative_by_month.png
-516 Bytes b/‎outputs/figures/covid19_preprints_day_cumulative_by_month.png
-516 Bytes
diff --git a/‎outputs/figures/covid19_preprints_day_cumulative_by_week.png
1.21 KB b/‎outputs/figures/covid19_preprints_day_cumulative_by_week.png
1.21 KB
diff --git a/‎outputs/figures/covid19_preprints_month.png
2.29 KB b/‎outputs/figures/covid19_preprints_month.png
2.29 KB
diff --git a/‎outputs/figures/covid19_preprints_week.png
1.57 KB b/‎outputs/figures/covid19_preprints_week.png
1.57 KB
@@ -236,7 +236,7 @@ An inspection of the published dates of SSRN preprints indicates some abnormalit
 getSSRNPublicationDate <- function(doi) {
 
  # Rate-limit requests: pause before each lookup to avoid HTTP 429 (Too Many Requests) responses
- Sys.sleep(2.5)
+ Sys.sleep(10)
 
  # Base URL for querying
  base_url <- "https://doi.org/"
@@ -292,8 +292,6 @@ The datasets derived from "posted-content" and from SSRN are merged to a final C
 
 cr_covid <- bind_rows(cr_posted_content_covid, cr_ssrn_covid)
 
-write_csv(cr_covid, "cr_covid.csv")
-
 rm(cr_posted_content_covid, cr_ssrn_covid)
 
 ```
@@ -581,7 +579,6 @@ repec_covid <- tryCatch(
 
 ```{r message = FALSE, warning = FALSE, cache = TRUE}
 
-
 covid_preprints_update <- bind_rows(cr_covid, 
  dc_covid, 
  ar_covid, 
@@ -709,7 +706,7 @@ palette <- c(pal_1, pal_2)
 ```{r message = FALSE, warning = FALSE}
 
 # Minimum number of preprints for a repository to be included in graphs (otherwise there are too many categories/labels, which is confusing). Aim to include 19 servers.
-n_min <- 150
+n_min <- 175
 
 # Repositories with < min preprints
 other <- covid_preprints %>%
 
@@ -45,6 +45,8 @@ Deduplicate final dataset to filter out records that are sampled twice.
 
 ```r
 sample_date_until<- Sys.Date()
+#sample_date_until <- as.Date("2022-11-03")
+
 posted_date_until<-floor_date(sample_date_until, "week") #set to last Sunday prior to sample date
 
 sample_date_from<- fromJSON("data/metadata.json") %>%
@@ -88,6 +90,7 @@ cr_expected_results <- cr_types(types = "posted-content",
 until_index_date= as.character(sample_date_until))
  )$meta$total_results
 
+
 # Query posted content
 cr_posted_content<- cr_types_(types="posted-content",
 works=TRUE, 
@@ -254,7 +257,7 @@ directly crawling the SSRN website (using the
 getSSRNPublicationDate<-function(doi) {
 
 # Rate-limit requests: pause before each lookup to avoid HTTP 429 (Too Many Requests) responses
- Sys.sleep(2.5)
+ Sys.sleep(10)
 
 # Base URL for querying
 base_url<-"https://doi.org/"
@@ -277,9 +280,23 @@ getSSRNPublicationDate <- function(doi) {
 }
 
 # Create the final SSRN dataset. Deduplication of versions is done at a later stage.
+
 cr_ssrn_covid<-cr_ssrn_df %>% 
 # Filter COVID-19 related preprints. SSRN metadata does not contain abstracts
- filter(str_detect(title, regex(search_string, ignore_case=TRUE))) %>%
+ filter(str_detect(title, regex(search_string, ignore_case=TRUE))) 
+
+#Filter on records not already in dataset (to reduce the number of records for which to crawl the SSRN website)
+
+covid_preprints_previous<- read_csv("data/covid19_preprints.csv") %>%
+ pull(identifier)
+
+cr_ssrn_covid<-cr_ssrn_covid %>%
+ filter(!identifier%in%covid_preprints_previous)
+
+rm(covid_preprints_previous)
+
+#crawl SSRN to retrieve posted dates
+cr_ssrn_covid<-cr_ssrn_covid %>%
 # Retrieve 'real' posted dates from the SSRN website. Warning: slow
  mutate(posted_date= ymd(map_chr(identifier, getSSRNPublicationDate)),
 source="SSRN") %>%
@@ -426,7 +443,7 @@ dc_returned_results <- dc_preprints %>%
  nrow()
 
 rm(dc_types, dc_clients, dc_years,
-dc_types_cartesian, dc_clients_cartesian, dc_years_cartesian,
+dc_types_cartesian, dc_clients_cartesian, dc_years_cartesian,cartesian,
 dc_parameters)
 ```
 
@@ -569,10 +586,6 @@ parseRepecPreprints <- function(item) {
 start_date<-sample_date_from
 end_date<-sample_date_until
 
-start_date<- as.Date("2021-11-01")
-end_date<- as.Date("2021-12-01")
-
-
 getRepecPreprints<-function(start_date, end_date) {
 d<-oai::list_records("http://oai.repec.org", 
 from=start_date, 
@@ -623,6 +636,9 @@ covid_preprints_update <- covid_preprints_update %>%
 abstract= str_squish(abstract)) %>%
  mutate(title= str_remove_all(title, "<.*?>"),
 title= str_squish(title))
+
+
+rm(cr_covid, dc_covid, ar_covid, repec_covid) 
 ```
 
 # Remove duplicate records (incl. versions) on same preprint server
@@ -672,6 +688,8 @@ covid_preprints <- covid_preprints %>%
 
 covid_preprints %>% 
  write_csv("data/covid19_preprints.csv")
+
+rm(covid_preprints_previous, covid_preprints_update)
 ```
 
 # Create metadata file (json file with sample date and release date)
@@ -715,7 +733,7 @@ palette <- c(pal_1, pal_2)
 
 ```r
 # Minimum number of preprints for a repository to be included in graphs (otherwise there are too many categories/labels, which is confusing). Aim to include 19 servers.
-n_min<-150
+n_min<-175
 
 # Repositories with < min preprints
 other<-covid_preprints %>%
 
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fadceec4407bc0bca5b06757a5b623185e01d2c7a0b71839ce6a7ea9889777a9
-size 128637152
+oid sha256:bfc031353e8abed3a9f7738bbb142446cf9e7df7c3dcdf7d309b6b4919439864
+size 130312287
@@ -1,6 +1,6 @@
 {
-"release_date": "2022-12-01",
-"sample_date": "2022-11-30",
-"posted_date": "2022-11-27",
+"release_date": "2022-12-28",
+"sample_date": "2022-12-28",
+"posted_date": "2022-12-25",
 "url": "https://github.com/nicholasmfraser/covid19_preprints/blob/master/data/covid19_preprints.csv?raw=true"
 }
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`		`-"release_date": "2022-12-01",`
`3`		`-"sample_date": "2022-11-30",`
`4`		`-"posted_date": "2022-11-27",`
	`2`	`+"release_date": "2022-12-28",`
	`3`	`+"sample_date": "2022-12-28",`
	`4`	`+"posted_date": "2022-12-25",`
`5`	`5`	`"url": "https://github.com/nicholasmfraser/covid19_preprints/blob/master/data/covid19_preprints.csv?raw=true"`
`6`	`6`	`}`