@@ -45,6 +45,8 @@ Deduplicate final dataset to filter out records that are sampled twice.
45
45
46
46
``` r
47
47
sample_date_until <- Sys.Date()
48
+ # sample_date_until <- as.Date("2022-11-03")
49
+
48
50
posted_date_until <- floor_date(sample_date_until , " week" ) # set to last Sunday prior to sample date
49
51
50
52
sample_date_from <- fromJSON(" data/metadata.json" ) %> %
@@ -88,6 +90,7 @@ cr_expected_results <- cr_types(types = "posted-content",
88
90
until_index_date = as.character(sample_date_until ))
89
91
)$ meta $ total_results
90
92
93
+
91
94
# Query posted content
92
95
cr_posted_content <- cr_types_(types = " posted-content" ,
93
96
works = TRUE ,
@@ -254,7 +257,7 @@ directly crawling the SSRN website (using the
254
257
getSSRNPublicationDate <- function (doi ) {
255
258
256
259
# Rate-limit requests to avoid being rejected with HTTP error 429 (Too Many Requests)
257
- Sys.sleep(2.5 )
260
+ Sys.sleep(10 )
258
261
259
262
# Base URL for querying
260
263
base_url <- " https://doi.org/"
@@ -277,9 +280,23 @@ getSSRNPublicationDate <- function(doi) {
277
280
}
278
281
279
282
# Create the final SSRN dataset. Deduplication of versions is done at a later stage.
283
+
280
284
cr_ssrn_covid <- cr_ssrn_df %> %
281
285
# Filter COVID-19 related preprints. SSRN metadata does not contain abstracts
282
- filter(str_detect(title , regex(search_string , ignore_case = TRUE ))) %> %
286
+ filter(str_detect(title , regex(search_string , ignore_case = TRUE )))
287
+
288
+ # Filter on records not already in dataset (to reduce the number of records for which to crawl the SSRN website)
289
+
290
+ covid_preprints_previous <- read_csv(" data/covid19_preprints.csv" ) %> %
291
+ pull(identifier )
292
+
293
+ cr_ssrn_covid <- cr_ssrn_covid %> %
294
+ filter(! identifier %in% covid_preprints_previous )
295
+
296
+ rm(covid_preprints_previous )
297
+
298
+ # crawl SSRN to retrieve posted dates
299
+ cr_ssrn_covid <- cr_ssrn_covid %> %
283
300
# Retrieve 'real' posted dates from the SSRN website. Warning: slow
284
301
mutate(posted_date = ymd(map_chr(identifier , getSSRNPublicationDate )),
285
302
source = " SSRN" ) %> %
@@ -426,7 +443,7 @@ dc_returned_results <- dc_preprints %>%
426
443
nrow()
427
444
428
445
rm(dc_types , dc_clients , dc_years ,
429
- dc_types_cartesian , dc_clients_cartesian , dc_years_cartesian ,
446
+ dc_types_cartesian , dc_clients_cartesian , dc_years_cartesian ,cartesian ,
430
447
dc_parameters )
431
448
```
432
449
@@ -569,10 +586,6 @@ parseRepecPreprints <- function(item) {
569
586
start_date <- sample_date_from
570
587
end_date <- sample_date_until
571
588
572
- start_date <- as.Date(" 2021-11-01" )
573
- end_date <- as.Date(" 2021-12-01" )
574
-
575
-
576
589
getRepecPreprints <- function (start_date , end_date ) {
577
590
d <- oai :: list_records(" http://oai.repec.org" ,
578
591
from = start_date ,
@@ -623,6 +636,9 @@ covid_preprints_update <- covid_preprints_update %>%
623
636
abstract = str_squish(abstract )) %> %
624
637
mutate(title = str_remove_all(title , " <.*?>" ),
625
638
title = str_squish(title ))
639
+
640
+
641
+ rm(cr_covid , dc_covid , ar_covid , repec_covid )
626
642
```
627
643
628
644
\# Remove duplicate records (incl. versions) on same preprint server
@@ -672,6 +688,8 @@ covid_preprints <- covid_preprints %>%
672
688
673
689
covid_preprints %> %
674
690
write_csv(" data/covid19_preprints.csv" )
691
+
692
+ rm(covid_preprints_previous , covid_preprints_update )
675
693
```
676
694
677
695
# Create metadata file (json file with sample date and release date)
@@ -715,7 +733,7 @@ palette <- c(pal_1, pal_2)
715
733
716
734
``` r
717
735
# Minimum number of preprints for a repository to be included in graphs (too many categories/labels would be confusing). Aim for 19 servers to include.
718
- n_min <- 150
736
+ n_min <- 175
719
737
720
738
# Repositories with < min preprints
721
739
other <- covid_preprints %> %
0 commit comments