arxiv-browse/script/timepdf_dl.py at develop · arXiv/arxiv-browse · GitHub

Name: arxiv-browse/script/timepdf_dl.py at develop · arXiv/arxiv-browse · GitHub
Rating: 4.6 (7182 reviews)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
Time the following for the paper ids in the file in arg1:

Get the pdfs from the cdn,
get the pdfs from beta.arxiv.org as current version
get the pdfs from beta.arxiv.org with version number
"""


import json
import re
import sys
from time import perf_counter, sleep

import pandas as pd
import requests

# The first command-line argument names a text file holding one
# gs://arxiv-production-ps-cache/... object path per line.
with open(sys.argv[1], 'rt') as fh:
    # Skip blank lines: an empty entry would otherwise become an empty id and
    # be requested against every endpoint.
    urls = [line.strip() for line in fh if line.strip()]

# Not referenced anywhere else in the visible script; kept so the module-level
# name stays available.
reex = re.compile(r'arxiv\/(?P<id>.*\.pdf)')

# Drop the bucket prefix so each entry is just the file name, e.g.
# "2301.01234v1.pdf".
pdfs = [url.replace('gs://arxiv-production-ps-cache/pdf/arxiv/', '') for url in urls]

def id_to_beta_with_v_url(id):
    """Build the beta.arxiv.org PDF URL for `id`, used as given.

    Returns:
        A ``(url, headers)`` pair; the headers element is an empty list.
    """
    url = 'https://beta.arxiv.org/pdf/{}'.format(id)
    no_headers = []
    return url, no_headers

# Splits a file name such as "2301.01234v2.pdf" into identifier and version.
# arXiv sequence numbers are 4 digits for older papers and 5 digits for newer
# ones, so both widths are accepted. The dots are escaped so they only match a
# literal ".".
_without_v = re.compile(r'(\d{4}\.\d{4,5})(v\d*)\.pdf')


def id_to_id_and_v(id):
    """Split a versioned PDF file name into its id and version parts.

    Args:
        id: A string containing something like ``"2301.01234v2.pdf"``.

    Returns:
        A tuple ``(idpart, vpart)``, e.g. ``("2301.01234", "v2")``.

    Raises:
        ValueError: If `id` does not contain a versioned arXiv PDF name.
    """
    match = _without_v.search(id)
    if not match:
        # Include the offending value so a bad input line can be located.
        raise ValueError(f"ID {id} did not match regex")
    return (match[1], match[2])

def id_to_beta_without_v_url(id):
    """Build the beta.arxiv.org PDF URL for `id` with the version removed.

    The id part is taken from ``id_to_id_and_v``, so any error that helper
    raises propagates to the caller.

    Returns:
        A ``(url, headers)`` pair; the headers element is an empty list.
    """
    bare_id = id_to_id_and_v(id)[0]
    url = 'https://beta.arxiv.org/pdf/{}.pdf'.format(bare_id)
    return url, []

def id_to_cdn_url(id):
    """Build the download.arxiv.org (CDN) PDF URL for `id`, used as given.

    Returns:
        A ``(url, headers)`` pair; the headers element is an empty list.
    """
    url = 'https://download.arxiv.org/pdf/arxiv/{}'.format(id)
    no_headers = []
    return url, no_headers


# Request headers returned alongside every arxiv.org URL built below.
ARXIV_HEADERS = {
    'User-Agent': 'periodic-rebuild',
}


def id_to_arxiv_with_v_url(id):
    """Build the arxiv.org PDF URL for `id`, used as given.

    Returns:
        A ``(url, headers)`` pair where headers is the shared
        ``ARXIV_HEADERS`` dict (the same object, not a copy).
    """
    url = 'https://arxiv.org/pdf/{}'.format(id)
    return url, ARXIV_HEADERS

def id_to_arxiv_without_v_url(id):
    """Build the arxiv.org PDF URL for `id` with the version removed.

    Mirrors ``id_to_beta_without_v_url``: the version suffix is stripped via
    ``id_to_id_and_v`` so the request targets the current version. Previously
    the versioned id was used unchanged, making this identical to
    ``id_to_arxiv_with_v_url``.

    Returns:
        A ``(url, headers)`` pair where headers is ``ARXIV_HEADERS``.

    Raises:
        ValueError: Propagated from ``id_to_id_and_v`` when `id` does not match.
    """
    idpart, _ = id_to_id_and_v(id)
    return f'https://arxiv.org/pdf/{idpart}.pdf', ARXIV_HEADERS

# Each entry is (name, URL builder, description). The name is used as the key
# in the results and the description is stored next to the timings.
# Entries run in list order, so "CDN download 2nd pass" repeats the CDN
# builder after the first CDN pass has already fetched every file.
tests = [
    ('beta version download', id_to_beta_with_v_url,
     "Requests to beta.arxiv.org/pdf with the version number."),
    ('beta current download', id_to_beta_without_v_url,
     "Requests to beta.arxiv.org/pdf without a version number."),
    ('CDN download', id_to_cdn_url,
     "Requests to CDN at download.arxiv.org/pdf"),
    ('CDN download 2nd pass', id_to_cdn_url,
     "Requests to CDN 2nd run after cache has been warmed"),
    ('arxiv.org version download', id_to_arxiv_with_v_url,
     "Requests to arxiv.org/pdf with a version number."),
    ('arxiv.org current download', id_to_arxiv_without_v_url,
     "Requests to arxiv.org/pdf without a version number."),
]

# Timeout in seconds handed to requests. It bounds the connect phase and each
# wait for data from the server, not the whole download, so large PDFs still
# complete while a stalled connection no longer hangs the run.
REQUEST_TIMEOUT = 60


def _summarize(samples):
    """Return pandas describe() statistics for `samples` as a plain dict.

    An empty sample list yields an empty dict, because describe() cannot
    summarize a frame that has no columns.
    """
    if not samples:
        return {}
    return pd.DataFrame(samples).describe().to_dict()


# Everything collected ends up in this dict: the list of pdfs plus one entry
# per test name.
data = {'pdfs': pdfs}

# Truthy: print every URL before fetching it. Falsy: print one dot per request.
verbose = 1
for test_name, req_fn, desc in tests:
    print(f'Starting "{test_name}"')
    timings = []
    bytes_per_sec = []
    responses = {}
    # The lists/dict are stored by reference and filled in by the loop below.
    data[test_name] = dict(timings=timings, desc=desc, responses=responses)

    for pdf in pdfs:
        url, headers = req_fn(pdf)
        start = perf_counter()
        n_bytes = 0  # stays 0 when the request fails
        if verbose:
            print(f'about to get {url}')
        else:
            print('.', end='', flush=True)
        try:
            resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            # Reading .content pulls the whole body, so the download is
            # included in the measured time.
            n_bytes = len(resp.content)
            responses[pdf] = dict(url=url,
                                  headers={key: resp.headers[key] for key in resp.headers},
                                  status_code=resp.status_code)
        except Exception as ex:
            # Best effort: record the failure and keep going with the next pdf.
            responses[pdf] = dict(url=url, error=f"{test_name}{url} failed with {ex}")

        dt = perf_counter() - start
        timings.append(dt)
        responses[pdf]['time'] = dt

        bps = n_bytes / dt
        bytes_per_sec.append(bps)
        responses[pdf]['bytes_per_sec'] = bps

        # Pause between requests.
        sleep(0.5)

    data[test_name]['summary_dt'] = _summarize(timings)
    data[test_name]['summary_byte_per_sec'] = _summarize(bytes_per_sec)

# Serialize first, then open the file: if serialization fails, an existing
# results.json is left untouched instead of being truncated.
data_json = json.dumps(data, sort_keys=True, indent=4)

# Written to the current working directory. json.dumps escapes non-ASCII by
# default, so the explicit encoding only pins what is already ASCII output.
with open('results.json', 'wt', encoding='utf-8') as out_fh:
    out_fh.write(data_json)