- Notifications
You must be signed in to change notification settings - Fork 73
/
Copy pathanalyze_warnings.py
82 lines (58 loc) · 2.48 KB
/
analyze_warnings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""Compare to legacy, the warnings in a json file as downloaded from
GCP log explorer.
Run as
python scripts/analyze_warnings.py 404_log_export_from_GCP.json
If you take the GCP log explorer and get all the WARNING log lines from the
cloud run service, you can run them through this script.
It will give a summary by status with examples.
Then it will take all the 404 responses and try to get them from
export.arxiv.org. The responses from GCP will be compared with the ones
from export to let you know if they are real 404s, missing from the
sync, withdrawn (aka no_author_source) or unavailable.
"""
import sys
import re
import json
from pathlib import Path
from collections import defaultdict

import requests
# User-Agent sent with every request to the legacy (export) host.
ENSURE_UA = 'periodic-rebuild'

# Seconds to wait on a single legacy request; without a timeout one stalled
# connection would hang the whole run.
REQUEST_TIMEOUT = 30

# Where the per-URL results of the legacy check are written.
ANALYSIS_FILE = Path('404_analysis.json')


def group_by_status(rows):
    """Group log rows by the status found at ``row['httpRequest']['status']``.

    Returns a ``defaultdict(list)`` mapping each status to its rows, in the
    order the rows were given.
    """
    bystatus = defaultdict(list)
    for row in rows:
        bystatus[row['httpRequest']['status']].append(row)
    return bystatus


def legacy_url(request_url):
    """Return ``request_url`` with ``'download.'`` replaced by ``'export.'``."""
    return request_url.replace('download.', 'export.')


def classify_response(status_code, content_type, text):
    """Build the analysis record for one legacy response.

    Args:
        status_code: HTTP status of the legacy response.
        content_type: value of its Content-Type header; ``None`` or ``''``
            when the header is absent.
        text: decoded response body.

    Returns:
        dict with ``status_code`` (int) and the booleans ``is_pdf``,
        ``unavailable`` and ``no_author_source``.
    """
    # Media types are case-insensitive, and the header may be missing.
    content_type = (content_type or '').lower()
    return dict(
        status_code=int(status_code),
        is_pdf='pdf' in content_type,
        unavailable='PDF unavailable for' in text,
        no_author_source='The author has provided no source' in text,
    )


def summarize(responses):
    """Split analysis records into ``(non200, unavailable)`` lists.

    ``non200`` holds records whose status is not 200 (including failed
    requests recorded with status ``None``); ``unavailable`` holds records
    with status 200 whose body said the PDF was unavailable.
    """
    records = list(responses.values())
    non200 = [item for item in records if item['status_code'] != 200]
    unavailable = [
        item for item in records
        if item['status_code'] == 200 and item['unavailable']
    ]
    return non200, unavailable


def main(argv=None):
    """Summarize a GCP log export and check its 404s against legacy.

    Args:
        argv: argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            path of the JSON log export.

    Exits with a usage message when no path is given. Writes the per-URL
    results to ``404_analysis.json`` in the current directory.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: python analyze_warnings.py LOG_EXPORT.json")
    log_path = argv[1]

    print(f"* Analysis of {log_path}")
    with open(log_path, encoding='utf-8') as fh:
        data = json.load(fh)
    print(f"Number of rows: {len(data)}")

    bystatus = group_by_status(data)
    for key in bystatus:
        print(f"{key}: {len(bystatus[key])} responses")
    for key in bystatus:
        print(f"Examples of {key} responses")
        for row in bystatus[key][0:3]:
            print(" - " + row['httpRequest']['requestUrl'])

    print("Checking if 404s exist on legacy")
    session = requests.Session()
    session.headers = {
        'User-Agent': ENSURE_UA,
        'Accept': '*/*',
    }

    responses = {}
    # .get() avoids inserting an empty 404 entry into the defaultdict.
    for row in bystatus.get(404, []):
        url = legacy_url(row['httpRequest']['requestUrl'])
        try:
            resp = session.get(url, allow_redirects=True,
                               timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            # Keep going: one unreachable URL must not discard the results
            # already collected. Recorded with status None so it is counted
            # as non-200 in the summary.
            print(f"{url}: request failed: {err}")
            responses[url] = dict(status_code=None,
                                  is_pdf=False,
                                  unavailable=False,
                                  no_author_source=False)
            continue

        content_type = resp.headers.get('content-type', '')
        record = classify_response(resp.status_code, content_type, resp.text)
        print(f"{url}: {resp.status_code} is_pdf: {record['is_pdf']} content-type: {content_type} unavailable: {record['unavailable']}")
        responses[url] = record

    with open(ANALYSIS_FILE, 'w') as fh:
        json.dump(responses, fh, indent=2)

    non200, unavailable = summarize(responses)
    print(f"resonse from legacy was non-200: {len(non200)}")
    print(f"resonse from legacy was 200 but pdf was unavailable: {len(unavailable)}")
    print("DONE\n")


if __name__ == "__main__":
    main()