Skip to content

Commit 1d5832c

Browse files
committed
Update 2022-12-28 for preprints posted up until 2022-12-25
1 parent f1d86e4 commit 1d5832c

14 files changed

+33
-18
lines changed

covid19_preprints.Rmd

+2-5
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ An inspection of the published dates of SSRN preprints indicates some abnormalit
236236
getSSRNPublicationDate <- function(doi) {
237237
238238
# In case requests are rejected for exceeding the rate limit (HTTP error 429), pause between requests
239-
Sys.sleep(2.5)
239+
Sys.sleep(10)
240240
241241
# Base URL for querying
242242
base_url <- "https://doi.org/"
@@ -292,8 +292,6 @@ The datasets derived from "posted-content" and from SSRN are merged to a final C
292292
293293
cr_covid <- bind_rows(cr_posted_content_covid, cr_ssrn_covid)
294294
295-
write_csv(cr_covid, "cr_covid.csv")
296-
297295
rm(cr_posted_content_covid, cr_ssrn_covid)
298296
299297
```
@@ -581,7 +579,6 @@ repec_covid <- tryCatch(
581579

582580
```{r message = FALSE, warning = FALSE, cache = TRUE}
583581
584-
585582
covid_preprints_update <- bind_rows(cr_covid,
586583
dc_covid,
587584
ar_covid,
@@ -709,7 +706,7 @@ palette <- c(pal_1, pal_2)
709706
```{r message = FALSE, warning = FALSE}
710707
711708
# Minimum number of preprints for a server to be included in graphs (too many categories/labels would be confusing; aim to include 19 servers)
712-
n_min <- 150
709+
n_min <- 175
713710
714711
# Repositories with < min preprints
715712
other <- covid_preprints %>%

covid19_preprints.md

+26-8
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ Deduplicate final dataset to filter out records that are sampled twice.
4545

4646
```r
4747
sample_date_until<- Sys.Date()
48+
#sample_date_until <- as.Date("2022-11-03")
49+
4850
posted_date_until<-floor_date(sample_date_until, "week") #set to last Sunday prior to sample date
4951

5052
sample_date_from<- fromJSON("data/metadata.json") %>%
@@ -88,6 +90,7 @@ cr_expected_results <- cr_types(types = "posted-content",
8890
until_index_date= as.character(sample_date_until))
8991
)$meta$total_results
9092

93+
9194
# Query posted content
9295
cr_posted_content<- cr_types_(types="posted-content",
9396
works=TRUE,
@@ -254,7 +257,7 @@ directly crawling the SSRN website (using the
254257
getSSRNPublicationDate<-function(doi) {
255258

256259
# In case requests are rejected for exceeding the rate limit (HTTP error 429), pause between requests
257-
Sys.sleep(2.5)
260+
Sys.sleep(10)
258261

259262
# Base URL for querying
260263
base_url<-"https://doi.org/"
@@ -277,9 +280,23 @@ getSSRNPublicationDate <- function(doi) {
277280
}
278281

279282
# Create the final SSRN dataset. Deduplication of versions is done at a later stage.
283+
280284
cr_ssrn_covid<-cr_ssrn_df %>%
281285
# Filter COVID-19 related preprints. SSRN metadata does not contain abstracts
282-
filter(str_detect(title, regex(search_string, ignore_case=TRUE))) %>%
286+
filter(str_detect(title, regex(search_string, ignore_case=TRUE)))
287+
288+
# Keep only records not already in the dataset (to reduce the number of records for which the SSRN website has to be crawled)
289+
290+
covid_preprints_previous<- read_csv("data/covid19_preprints.csv") %>%
291+
pull(identifier)
292+
293+
cr_ssrn_covid<-cr_ssrn_covid %>%
294+
filter(!identifier%in%covid_preprints_previous)
295+
296+
rm(covid_preprints_previous)
297+
298+
#crawl SSRN to retrieve posted dates
299+
cr_ssrn_covid<-cr_ssrn_covid %>%
283300
# Retrieve 'real' posted dates from the SSRN website. Warning: slow
284301
mutate(posted_date= ymd(map_chr(identifier, getSSRNPublicationDate)),
285302
source="SSRN") %>%
@@ -426,7 +443,7 @@ dc_returned_results <- dc_preprints %>%
426443
nrow()
427444

428445
rm(dc_types, dc_clients, dc_years,
429-
dc_types_cartesian, dc_clients_cartesian, dc_years_cartesian,
446+
dc_types_cartesian, dc_clients_cartesian, dc_years_cartesian,cartesian,
430447
dc_parameters)
431448
```
432449

@@ -569,10 +586,6 @@ parseRepecPreprints <- function(item) {
569586
start_date<-sample_date_from
570587
end_date<-sample_date_until
571588

572-
start_date<- as.Date("2021-11-01")
573-
end_date<- as.Date("2021-12-01")
574-
575-
576589
getRepecPreprints<-function(start_date, end_date) {
577590
d<-oai::list_records("http://oai.repec.org",
578591
from=start_date,
@@ -623,6 +636,9 @@ covid_preprints_update <- covid_preprints_update %>%
623636
abstract= str_squish(abstract)) %>%
624637
mutate(title= str_remove_all(title, "<.*?>"),
625638
title= str_squish(title))
639+
640+
641+
rm(cr_covid, dc_covid, ar_covid, repec_covid)
626642
```
627643

628644
\#Remove duplicate records (incl. versions) on same preprint server
@@ -672,6 +688,8 @@ covid_preprints <- covid_preprints %>%
672688

673689
covid_preprints %>%
674690
write_csv("data/covid19_preprints.csv")
691+
692+
rm(covid_preprints_previous, covid_preprints_update)
675693
```
676694

677695
# Create metadata file (json file with sample date and release date)
@@ -715,7 +733,7 @@ palette <- c(pal_1, pal_2)
715733

716734
```r
717735
# Minimum number of preprints for a server to be included in graphs (too many categories/labels would be confusing; aim to include 19 servers)
718-
n_min<-150
736+
n_min<-175
719737

720738
# Repositories with < min preprints
721739
other<-covid_preprints %>%
Loading
Loading
Loading
Loading
Loading

data/covid19_preprints.csv

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:fadceec4407bc0bca5b06757a5b623185e01d2c7a0b71839ce6a7ea9889777a9
3-
size 128637152
2+
oid sha256:bfc031353e8abed3a9f7738bbb142446cf9e7df7c3dcdf7d309b6b4919439864
3+
size 130312287

data/metadata.json

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
2-
"release_date": "2022-12-01",
3-
"sample_date": "2022-11-30",
4-
"posted_date": "2022-11-27",
2+
"release_date": "2022-12-28",
3+
"sample_date": "2022-12-28",
4+
"posted_date": "2022-12-25",
55
"url": "https://github.com/nicholasmfraser/covid19_preprints/blob/master/data/covid19_preprints.csv?raw=true"
66
}
7.88 KB
Loading
Loading
2.29 KB
Loading
1.57 KB
Loading

0 commit comments

Comments
 (0)
close