- Notifications
You must be signed in to change notification settings - Fork 38
/
Copy pathjournal_export.py
118 lines (83 loc) · 3.7 KB
/
journal_export.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
importgzip
importtempfile
importboto
importunicodecsv
fromsqlalchemyimportsql
fromappimportdb
OA_STATS_FILE='journal_open_access.csv.gz'
REPO_FILE='repositories.csv.gz'
REQUESTS_FILE='extension_requests.csv.gz'
ISSNS_FILE='crossref_issns.csv.gz'
def_write_oa_stats_csv():
# get oa stats by color
stats_fields= [
'issn_l', 'title', 'year',
'num_dois', 'num_open', 'open_rate',
'num_green', 'green_rate',
'num_bronze', 'bronze_rate',
'num_hybrid', 'hybrid_rate',
'num_gold', 'gold_rate',
]
csv_fields=stats_fields+ ['is_in_doaj', 'is_gold_journal']
journal_stats_rows=db.engine.execute(
sql.text('''
select {stats_fields}, doaj.issn_l is not null as is_in_doaj, oa.issn_l is not null as is_gold_journal
from oa_rates_by_journal_year
left join oa_issn_l_years oa using (issn_l, year)
left join doaj_issn_l_years doaj using (issn_l, year)
where issn_l is not null and year is not null
'''.format(stats_fields=', '.join(stats_fields)))
).fetchall()
journal_dicts= [dict(list(zip(csv_fields, row))) forrowinjournal_stats_rows]
csv_filename=tempfile.mkstemp()[1]
withgzip.open(csv_filename, 'wb') ascsv:
writer=unicodecsv.DictWriter(csv, csv_fields, dialect='excel', encoding='utf-8')
writer.writeheader()
forrowinjournal_dicts:
writer.writerow(row)
returncsv_filename
def_write_repo_csv():
rows=db.engine.execute(sql.text('''
select issn_l, endpoint_id, r.repository_name, r.institution_name, r.home_page, e.pmh_url, num_articles
from num_articles_by_journal_repo
join endpoint e on e.id = endpoint_id
join repository r on e.repo_unique_id = r.id
''')).fetchall()
csv_filename=tempfile.mkstemp()[1]
withgzip.open(csv_filename, 'wb') ascsv:
writer=unicodecsv.writer(csv, dialect='excel', encoding='utf-8')
writer.writerow(('issn_l', 'endpoint_id', 'repository_name', 'institution_name', 'home_page', 'pmh_url', 'num_articles'))
forrowinrows:
writer.writerow(row)
returncsv_filename
def_write_crossref_issn_csv():
issns=db.engine.execute(sql.text('select issn from crossref_issn')).fetchall()
csv_filename=tempfile.mkstemp()[1]
withgzip.open(csv_filename, 'wb') ascsv:
writer=unicodecsv.writer(csv, dialect='excel', encoding='utf-8')
writer.writerow(['issn'])
forissninissns:
writer.writerow(issn)
returncsv_filename
def_write_requests_csv():
rows=db.engine.execute(sql.text('select month, issn_l, requests from extension_journal_requests_by_month')).fetchall()
csv_filename=tempfile.mkstemp()[1]
withgzip.open(csv_filename, 'wb') ascsv:
writer=unicodecsv.writer(csv, dialect='excel', encoding='utf-8')
writer.writerow(('month', 'issn_l', 'requests'))
forrowinrows:
writer.writerow(row)
returncsv_filename
def_upload_journal_file(filename, object_name):
s3=boto.connect_s3(host='s3-us-west-2.amazonaws.com')
bucket=s3.get_bucket('unpaywall-journal-csv')
boto.s3.key.Key(bucket=bucket, name=object_name).set_contents_from_filename(filename, replace=True)
defget_journal_file_key(filename):
s3=boto.connect_s3()
bucket=s3.get_bucket('unpaywall-journal-csv')
returnbucket.lookup(filename)
if__name__=="__main__":
_upload_journal_file(_write_oa_stats_csv(), OA_STATS_FILE)
_upload_journal_file(_write_repo_csv(), REPO_FILE)
_upload_journal_file(_write_requests_csv(), REQUESTS_FILE)
_upload_journal_file(_write_crossref_issn_csv(), ISSNS_FILE)