- Notifications
You must be signed in to change notification settings - Fork 164
/
Copy pathmemrise_scraper.py
66 lines (60 loc) · 1.96 KB
/
memrise_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import time
import urllib.request as req

from bs4 import BeautifulSoup
# Base URL of the Memrise course; the script appends "<page number>/" to it.
memrise_url = (
    "https://www.memrise.com/course/121215/barrons-800-essential-word-list-gre/"
)

# Base URL of the example-sentence site; the script appends the word to it.
sentences_url = "http://sentence.yourdictionary.com/"

# word -> meaning for every word seen so far (first meaning seen is kept).
mapping = {}
def write_to_file(combine, sentence_map, mapping,
                  path="memrise_mapper_clean.txt"):
    """Append word entries and their example sentences to a text file.

    For every ``(word, meaning)`` pair whose word is a key of ``mapping`` a
    ``word => meaning`` line is written, followed by the word's sentences
    (one per line) or a "No sentences" notice, followed by a separator.

    Args:
        combine: Iterable of indexable ``(word, meaning)`` entries, written
            in iteration order.
        sentence_map: Dict mapping a word to a list of sentence strings.
            A missing word or an empty list produces the "No sentences"
            notice.
        mapping: Dict of accepted words; entries whose word is not a key
            are skipped entirely.
        path: File to append to. Defaults to the name the script has
            always used.
    """
    separator = "!-------------------!\n\n"
    # Sentences come from scraped web pages and may hold non-ASCII text, so
    # the encoding is pinned instead of relying on the platform default.
    with open(path, "a", encoding="utf-8") as mfile:
        for entry in combine:
            word, meaning = entry[0], entry[1]
            if word not in mapping:
                continue
            mfile.write(str(word) + " => " + str(meaning) + "\n")
            sentences = sentence_map.get(word)
            if sentences:
                for sentence in sentences:
                    mfile.write(sentence + "\n")
            else:
                mfile.write("No sentences.. :( \n")
            mfile.write(separator)
def get_sentences(ur):
    """Download a sentence page and return the sentences found on it.

    Args:
        ur: Full URL of the page to fetch.

    Returns:
        A list with the text of the ``div.li_content`` element of every
        ``li.voting_li`` element on the page, in document order. Items
        without such a div are skipped. The list is empty when nothing
        matches.

    Raises:
        Any exception raised by ``urlopen`` or by reading the response;
        nothing is caught here.
    """
    print(ur)
    # The context manager closes the connection even if read() fails.
    with req.urlopen(ur) as response:
        data = response.read()
    soup = BeautifulSoup(data, 'lxml')
    sentences = []
    for item in soup.find_all("li", class_='voting_li'):
        content = item.find("div", class_='li_content')
        # find() returns None when the div is absent; skip such items
        # rather than failing on ``None.text``.
        if content is not None:
            sentences.append(content.text)
    return sentences
def main():
    """Scrape course pages 1-80 and append their words to the output file.

    For each page the word/meaning pairs are read, up to three example
    sentences are fetched per word, new words are recorded in the
    module-level ``mapping`` and the page's entries are handed to
    ``write_to_file``. The sleeps throttle the requests.
    """
    for page in range(1, 81):
        page_url = memrise_url + str(page) + "/"
        # Close the connection as soon as the page body has been read.
        with req.urlopen(page_url) as response:
            soup = BeautifulSoup(response.read(), 'lxml')

        combine = []
        sentence_map = {}
        for entry in soup.find_all("div", class_="thing text-text"):
            word = entry.find("div", class_="col_a col text")
            meaning = entry.find("div", class_="col_b col text")
            # find() returns None for a missing column; skip broken rows.
            if word is None or meaning is None:
                continue

            time.sleep(1.5)
            try:
                sentences = get_sentences(sentences_url + word.text)[:3]
            except Exception as err:
                # Best effort: a failed lookup must not abort the run, but it
                # must also not leave the previous word's sentences (or no
                # binding at all) in ``sentences``.
                print("Could not fetch sentences for %r: %s" % (word.text, err))
                sentences = []

            combine.append((word.text, meaning.text))
            sentence_map[word.text] = sentences
            # Keep the first meaning seen for a word.
            mapping.setdefault(word.text, meaning.text)

        write_to_file(combine, sentence_map, mapping)
        time.sleep(5)


if __name__ == '__main__':
    main()