I am using a Python scraper to grab publicly available data from http://103.48.16.132/echalan/,
but it takes almost ~6 GB of memory and a lot of CPU. Because of that I cannot run multiple instances of the code, which is what I need. Can anyone suggest a tweak or edit to reduce the memory and CPU footprint of this code? I have already added gc.collect() calls
and deleted unused variables, but to no avail.
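Roughly, the cleanup I already tried boils down to something like the snippet below (a simplified sketch, not my actual code; the variable name is illustrative, and the tracemalloc part is only there as one way to watch the numbers keep growing anyway):

import gc
import tracemalloc

tracemalloc.start()

batch_frames = []      # illustrative stand-in for the per-batch DataFrames
# ... batch_frames gets filled while a batch of requests is processed ...

batch_frames.clear()   # drop references to the per-batch objects
del batch_frames
gc.collect()           # force a collection cycle

current, peak = tracemalloc.get_traced_memory()
print("current={:.1f} MB, peak={:.1f} MB".format(current / 1e6, peak / 1e6))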
Rough workflow of the code:
- Generate dates from a given date range and, for each date, iterate over multiple challan numbers for each bank branch to collect data.
- Save the data to a DataFrame and finally to CSV; also save the IDs that returned data and the IDs that returned no data to files with the extension .dat.
- If the run is resumed, analyse the done and no-data IDs and proceed only with the IDs that are not done yet (a simplified sketch of this step follows below).
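To make that last point concrete, the date generation and resume filtering amount to roughly this (a simplified sketch; in the real code the .dat entries carry a branch/query-type prefix that is stripped first, which is omitted here):

from datetime import date, timedelta

def daterange(d1, d2):
    # yield every date from d1 to d2, inclusive
    for n in range(int((d2 - d1).days) + 1):
        yield d1 + timedelta(n)

def load_ids(path):
    # one ID per line; a missing file simply means a fresh run
    try:
        with open(path) as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        return set()

all_dates = [d.strftime("%d-%m-%Y") for d in daterange(date(2013, 1, 1), date(2022, 6, 30))]
handled = load_ids("doneCHALLAN.dat") | load_ids("noacCHALLAN.dat")
pending = [d for d in all_dates if d not in handled]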
The code I use:
# -*- coding: utf-8 -*-
import warnings
import re
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
import requests, sys, os
from datetime import datetime
from itertools import *
import time, gc
import pandas as pd
import lxml.html as LH
from datetime import timedelta, date
import pathlib
from collections.abc import Iterable
import requests, aiohttp, asyncio

start_timeTot = time.time()

bank_brnch = {
    '2573838': 'Agargaon (SB)', '42238412': 'AGLA, DHAKA', '41829141': 'Agrani Balika Bidyalaya (SB)', '42238417': 'AMIN BAZAR, DHAKA',
    '42238434': 'ARMANITOLA, DHAKA', '44205532': 'ASHULIA BAZAR', '42238436': 'ATI BAZAR, DHAKA', '42238443': 'AWLAD HOSSAIN MARKET, DHAKA',
    '1217867': 'B.B. Avenue Corp,Dhaka (SB)', '42238447': 'B.I.S.E., DHAKA', '42238448': 'B.M.E. BOARD, DHAKA', '42238451': 'B.U.E.T., DHAKA',
    '42238452': 'BABUBAZAR, DHAKA', '1218486': 'Badda, Dhaka (SB)', '1218487': 'Baitul Mokarrom,Dhaka (SB)', '42238453': 'BAJME KADERIA COMPLEX, DHAKA',
    '42238455': 'BANANI BAZAR, DHAKA', '1218488': 'Banani, Dhaka (SB)', '42238461': 'BANGA BANDHU JATIO STADIUM, DHAKA', '42238466': 'BANGA BHABAN, DHAKA',
    '2549487': 'Baridhara (SB)', '42238480': 'BASABO, DHAKA', '42238486': 'BAWANINAGAR, DHAKA', '1218708': 'Begum Rokeya Sarani,Dhaka (SB)',
    '1218720': 'Chawk Bazar,Dhaka', '42238489': 'CHURAIN, DHAKA', '41458064': 'COLLEGE GATE (SB)', '1218496': 'Custom House, Dhaka',
    '1218721': 'D.C.Hall, Dhaka', '1218715': 'D.E.P.Z,Dhaka', '1218489': 'Dhaka Cantt., Dhaka (SB)', '1218497': 'Dhaka Registration Com.,Dhaka',
    '41614746': 'DHAKA UNIVERSITY CAMPUS (SB)', '115': 'Dhamrai (SB)', '3649899': 'Dhanmondi Corp. (SB)', '1218502': 'Dilkusha Corp.Br., Dhaka (SB)',
    '42238494': 'DISTILARY ROAD, DHAKA', '1218500': 'Doyagonj, Dhaka', '1218503': 'Fakirapool,Dhaka (SB)', '1857462': 'Farash gonj, Dhaka (SB)',
    '1218490': 'Farmgate, Dhaka (SB)', '42238498': 'FOREIGN EXCHANGE CORPORATE, DHAKA', '42238501': 'GANA BHABAN, DHAKA', '42238505': 'GORAN, DHAKA',
    '42238507': 'GREEN ROAD, DHAKA', '42070627': 'GULSHAN (SB)', '1218491': 'Gulshan New North,Dhaka (SB)', '42238511': 'HAZARIBAG, DHAKA',
    '41293811': 'HAZRAT SHAHJALAL INTL AIRPORT', '42238512': 'Hotel Inter-Continental Br(SHERATAN),DHAKA', '42238517': 'IBRAHIMPUR, DHAKA',
    '42238524': 'ISHWARCHANDRA STREET, DHAKA', '36250033': 'JATIO SANGSAD BHABAN BR.', '1218651': 'Jatrabari, Dhaka (SB)', '417': 'Joypara (SB)',
    '1218696': 'Kakrail,Dhaka (SB)', '42238528': 'KALAKOPA, DHAKA', '42238533': 'KALAMPUR, DHAKA', '42238536': 'KALATIA, DHAKA',
    '41839603': 'KALYAN PUR (SB)', '5602261': 'Kamlapur Rly. St. ICD Br.', '42238538': 'KAWRAN BAZAR, DHAKA,SB', '418': 'Keraniganj (SB)',
    '1218654': 'Khilgaon, Dhaka (SB)', '42143382': 'KRISHI BAZAR MOHAMMADPUR', '41751373': 'KRISHI BHABAN (SB)', '42238541': 'KURMITOLA, DHAKA',
    '1218723': 'Lalbagh,Dhaka (SB)', '1218698': 'Lalmatia,Dhaka (SB)', '1857477': 'Laxmi Bazar, Dhaka (SB)', '1217860': 'Local Office,Dhaka',
    '42238544': 'MAKIM KATRA, DHAKA', '1218656': 'Malibagh,Dhaka (SB)', '42241715': 'MANIK MIAH AVENUE, DHAKA', '1218700': 'Md.Pur Bazar, Dhaka (SB)',
    '42238546': 'MIRPUR CANTT., DHAKA', '1218711': 'Mirpur I/A, Dhaka', '2717246': 'Mirpur Sec-1', '42238547': 'MITFORD ROAD, DHAKA',
    '1218493': 'Mogh Bazar, Dhaka (SB)', '1218494': 'Mohakhali, Dhaka (SB)', '1218498': 'N.C.T.B,Dhaka (SB)', '2549438': 'Nagar Bhabon (SB)',
    '42238548': 'NAJIRABAZAR, DHAKA', '41829146': 'Naval H/Q (SB)', '419': 'Nawabganj (Dhaka)', '1218724': 'Nawabpur Road,Dhaka',
    '42238563': 'NAYABAZAR, DHAKA', '42238570': 'NAYARHAT, DHAKA', '1218762': 'Nazimuddin Road, Dhaka (SB)', '1218665': 'New Market, Dhaka',
    '2452744': 'North South Road Br. Dhaka (SB)', '42238573': 'P.A.T.C. (SAVAR), DHAKA', '42238574': 'PALAMGANJ, DHAKA',
    '1218699': 'Pallabi Br. (Mirpur-12 ), Dhaka', '44332559': 'PANGAON ICT BR.', '1218725': 'Postagola,Dhaka (SB)',
    '41581585': "PRIME MINISTER'S OFFICE (SB)", '40338614': 'Public Service Commission Branch (Dhaka Airport Branch)',
    '42238578': 'RAJUK BHABAN, DHAKA', '4039439': 'Ramna Corporate Branch (SB)', '42238581': 'RAMPURA, DHAKA', '42238583': 'RASULPUR BAZAR, DHAKA',
    '42238588': 'RUHITPUR, DHAKA', '1218726': 'Sadarghat Corp. Br,Dhaka (SB)', '42238593': 'SAIDABAD BUS TERMINAL, DHAKA',
    '1218701': 'Sat Masjid, Dhaka (SB)', '325': 'Savar (SB)', '1218702': 'Savar Cantt.,Dhaka (SB)', '41423293': 'SEGUN BAGICHA (SB)',
    '41501647': 'Shahjanpur (SB)', '1218659': 'Shilpa Bhaban,Dhaka (SB)', '1218704': 'Sonargaon Road,Dhaka (SB)', '42139442': 'Sonargoan Hotel (SB)',
    '1218706': 'Supreme Court,Dhaka (SB)', '42238602': 'TEJGAON INDUSTIAL AREA, DHAKA', '42238606': 'URDU ROAD, DHAKA', '41583041': 'UTTAR KHAN',
    '41582663': 'UTTARA MODEL TOWN (SB)', '41426798': 'VIQUARUN NESA NOON SCHOOL (SB)', '41660316': 'Wage Earners Corporate (SB)',
    '2452798': 'WAPDA Building Br.', '1218750': 'Wari, Dhaka (SB)', '1218695': 'Zigatola,Dhaka (SB)'}

ouputpath = os.path.join(os.path.dirname(sys.argv[0]), 'C_{0}'.format(time.strftime("%Y%d%m")))
merge_folder_csv_after_run = False
batch_size_for_async_request = 1000
time_out_for_request_wait = 300
process_missed_accounts_flag = True
headerWriteFlag = True
donelist = missedlist = noaclist = whataclist = dfschallan = []
start_sd_index = 0
#end_sd_index = len(accounts)
accountErrorFlag = []
switch_code_if_not_found = 20000
total_no_acc_to_change = []
switch_code_if_not_found_in_total = 50000
filepath = ""


def wait_for_internet_connection():
    while True:
        try:
            response = requests.get('https://www.google.com/?hl=bn', timeout=5)
            if response.ok:
                return
        except Exception:
            time.sleep(5)
            print("Waited for internet to connect {}.".format(datetime.now()))
            pass


wait_for_internet_connection()

try:
    with open('missedCHALLAN.dat', 'r') as f:
        missedlist = f.readlines()
    missedlist = list(filter(None, list(set(list(map(str.strip, missedlist))))))
except:
    pass
try:
    with open('noacCHALLAN.dat', 'r') as f:
        noaclist = f.readlines()
    noaclist = list(filter(None, list(set(list(map(str.strip, noaclist))))))
except:
    pass
try:
    with open('whatacCHALLAN.dat', 'r') as f:
        whataclist = f.readlines()
    whataclist = list(filter(None, list(set(list(map(str.strip, whataclist))))))
except:
    pass
try:
    with open('doneCHALLAN.dat', 'r') as f:
        donelist = f.readlines()
    donelist = list(filter(None, list(map(str.strip, donelist))))
    #donelistSet = set(done_list)  # for faster performance convert to set
    #TINSUnique = [tn__ for tn__ in uTINS if tn__ not in donelistSet]
    #TINS = TINSUnique
except:
    pass

#prevent duplicate header write
if len(donelist) > 0:
    headerWriteFlag = False

# assumed defaults: these names are referenced by the date-range helpers below but were
# missing from the original snippet (only needed if forward_date_range() is actually used)
DATE_FORMAT = '%Y-%m-%d'
DATE_STEP = timedelta(days=1)


def _strptime(date_str):
    return datetime.strptime(date_str, DATE_FORMAT).date()


def daterange(date1, date2):
    for n in range(int((date2 - date1).days) + 1):
        yield date1 + timedelta(n)


def _strftime(date):
    return date.strftime(DATE_FORMAT)


def flatten(xs):
    for x in xs:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            yield from flatten(x)
        else:
            yield x


def _date_range_parameters(start, end, span_days):
    start = _strptime(start)
    end = _strptime(end)
    span = timedelta(days=span_days)
    return start, end, span


#File and folder name sanitizer
def sanitize_file_folder_name(ffname):
    reserved_chars = [':', '>', '<', '"', '/', '\\', '*', '?', '|']
    for rc in reserved_chars:
        ffname = ffname.replace(rc, '_').strip()
    return ffname


def forward_date_range(start, end, span_days):
    """
    Generate tuples with intervals from given range of dates (forward).

    forward_date_range('2012-01-01', '2012-01-5', 2)
    1st yield = ('2012-01-01', '2012-01-03')
    2nd yield = ('2012-01-04', '2012-01-05')
    """
    start, end, span = _date_range_parameters(start, end, span_days)
    stop = end - span
    while start < stop:
        current = start + span
        yield _strftime(start), _strftime(current)
        start = current + DATE_STEP
    yield _strftime(start), _strftime(end)


def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


def dedeuper(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]


async def fetch(acc_, url_, payld_, timout, hdr, check_test_lst, no_acc_check_lst):
    dt = []
    unqID = acc_ + "_" + str(payld_['chalan_no'])  # built before the request so the except block can always reference it
    try:
        #connector = aiohttp.TCPConnector(limit=10,force_close=True)
        #challanno = payld_['chalan_no']
        async with aiohttp.ClientSession() as session:
            async with session.post(url_, headers=hdr, data=payld_, timeout=timout) as response:
                resp = await response.read()
                #resp = await response.text()
                root = LH.fromstring(resp)
                txt = root.text_content()
                tds = root.xpath("//td")
                #print(root.text_content())
                #AccountName = root.xpath("((//div[contains(@class,'col-lg-9 col-md-9')]//table)[3]//div)[1]")[0].text.strip()
                if all([i not in txt for i in check_test_lst]):
                    #print(txt)
                    dt_dict = [{'UniqueID': acc_,
                                'challan_no': tds[2].text_content().split(":")[-1].strip(),
                                'date': tds[3].text_content().split(":")[-1].strip(),
                                'bank': tds[5].text_content().split(":")[-1].strip(),
                                'branch': tds[6].text_content().split(":")[-1].strip(),
                                'code': tds[7].text_content().split(":")[-1].strip(),
                                'name': tds[17].text_content().split(":")[-1].strip(),
                                'Amount': tds[19].text_content().split(":")[-1].strip(),
                                'Timestamp': str(datetime.now())}]
                    dfName1 = pd.DataFrame(dt_dict)
                    dt = [dfName1, acc_]
                    print("Successfully got account {} with response of length {}.".format(unqID, len(resp)))
                elif all([i in txt for i in no_acc_check_lst]):
                    dt = ['NoAC', acc_]
                    print("No account found for {}.".format(unqID))
    except Exception as e:
        #pass
        print("Unable to get account {} due to {}.".format(unqID, e.__class__))
        dt = ['Error', acc_]
        print("Error while trying to collect for {}.".format(unqID))
    return dt


async def get(_brcd, _postQry, _dat):
    #print('Working on {}'.format(_dat))
    LOGIN_URL = 'http://103.48.16.132/echalan/VerifyChalan_new.php'  #'https://CHALLAN.org.bd/service/ebill'
    dt_get = []
    dtframes = []
    challan_miss_counter = []
    time_out_for_request_wait = 3
    headers = {
        'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Origin': 'http://103.48.16.132',
        'Referer': 'http://103.48.16.132/echalan/echalan_iframe.php',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36',
        'X-Prototype-Version': '1.6.1',
        'X-Requested-With': 'XMLHttpRequest',
    }
    challan_nos = list(range(1, 99999999))  # materialises ~1e8 candidate challan numbers on every call
    breakwhile = False
    while True:
        #global dt_get
        for challan_no in challan_nos:
            payload = {
                'bank_branch_id': _brcd,
                'chalan_date': _dat,
                'chalan_no': challan_no,
                'trans_type': _postQry,
                'counter_no': '0',
                'bank_id': '2',
                '_': ''
            }
            uniqIDforDATE = _brcd + "_" + _postQry + "_" + _dat
            #pass the weekends
            #if parser.parse(_dat).weekday() in [4,5]:
            #    continue
            dt_get = await fetch(uniqIDforDATE, LOGIN_URL, payload, time_out_for_request_wait, headers,
                                 ['Chalan not found'], ['Chalan not found'])
            #time.sleep(random.uniform(0.2,0.8))
            if 'NoAC' in dt_get[0]:
                challan_miss_counter.append('bypass')
            elif isinstance(dt_get[0], pd.DataFrame):
                dtframes.append(dt_get[0])
            if len(challan_miss_counter) > maxCheckToPassSingleDate:
                breakwhile = True
                break
            elif isinstance(dt_get[0], pd.DataFrame):
                challan_miss_counter = []
            #safety measure for no data
            if len(dt_get) < 2:
                dt_get = ['What', uniqIDforDATE]
        if breakwhile:
            break
    if len(dtframes):
        dfconcated = pd.concat(dtframes)
        dt_get = [dfconcated, uniqIDforDATE]
    return dt_get


async def main(br_code, post_qry_type, date_batch):
    global accountErrorFlag
    global total_no_acc_to_change
    global headerWriteFlag
    #wait for internet connection
    wait_for_internet_connection()
    dones, pendings = await asyncio.wait([get(br_code, post_qry_type, dat) for dat in date_batch])
    #print("Finalized all. ret is a list of len {} outputs.".format(len(dones)))
    data_results = [i.result() for i in dones]
    dfs = [i[0] for i in data_results if isinstance(i[0], pd.DataFrame)]
    if len(dfs) > 0:
        dfNameConcated = pd.concat(dfs)
        dfschallan.append(dfNameConcated)
        name_suffix_done = "_" + sanitize_file_folder_name(bank_brnch[br_code] + "_" + dfNameConcated['UniqueID'].unique().tolist()[0] + "_")
        #Create a folder
        curdir = pathlib.Path().absolute()
        flderpath = curdir.joinpath(sanitize_file_folder_name(bank_brnch[br_code]))
        flderpath.mkdir(parents=True, exist_ok=True)
        nam = '_{0}_{1}'.format(post_qry_type, time.strftime("%d%m%Y")) + name_suffix_done + '.csv'
        filepath = curdir.joinpath(sanitize_file_folder_name(bank_brnch[br_code])) / nam
        dfNameConcated.to_csv(filepath, encoding='utf-8-sig', index=False, mode='a', header=headerWriteFlag)
        done_accounts = sorted(dfNameConcated['UniqueID'].unique())
        #write done accounts
        df_done = pd.DataFrame(done_accounts, columns=["done_accounts"])
        done_file_name = 'doneCHALLAN.dat'
        df_done.to_csv(done_file_name, mode='a', index=False, header=False)
        headerWriteFlag = False
    no_ac_data = [i[0] for i in [i for i in data_results if not isinstance(i[0], pd.DataFrame)] if i[0] == 'NoAC']
    if len(no_ac_data) == batch_size_for_async_request:
        accountErrorFlag = accountErrorFlag + no_ac_data
    elif len(dfs) > 0:
        accountErrorFlag = []
    if len(no_ac_data) > 0:
        total_no_acc_to_change = total_no_acc_to_change + no_ac_data
    missed_accounts = [i[1] for i in [i for i in data_results if not isinstance(i[0], pd.DataFrame)] if i[0] == 'Error']
    #write missed accounts
    df_missed = pd.DataFrame(missed_accounts, columns=["missed_accounts"])
    missed_csv_name = 'missedCHALLAN.dat'
    df_missed.to_csv(missed_csv_name, mode='a', index=False, header=False)
    #process no accounts
    noac_accounts = [i[1] for i in [i for i in data_results if not isinstance(i[0], pd.DataFrame)] if i[0] == 'NoAC']
    #write no-account IDs
    df_noac = pd.DataFrame(noac_accounts, columns=["no_accounts"])
    noac_csv_name = 'noacCHALLAN.dat'
    df_noac.to_csv(noac_csv_name, mode='a', index=False, header=False)
    #process what accounts
    whatac_accounts = [i[1] for i in [i for i in data_results if not isinstance(i[0], pd.DataFrame)] if i[0] == 'What']
    #write what-account IDs
    df_whatac = pd.DataFrame(whatac_accounts, columns=["what_accounts"])
    whatac_csv_name = 'whatacCHALLAN.dat'
    df_whatac.to_csv(whatac_csv_name, mode='a', index=False, header=False)
    print("Accounts collection performance=========================noac/missed/done/what = {0} / {1} / {2} / {3}===========================".format(
        len(no_ac_data), len(missed_accounts), len(dfs), len(whatac_accounts)))


total_accounts_scraped = 0
xSearchDateList = []
#accounts_ = accounts[start_sd_index:end_sd_index]
accounts_offices = list(bank_brnch.items())
for accounts_office in accounts_offices:
    dtS_ = '2013-01-01'  #(Y-M-D) start date
    dtE_ = '2022-06-30'  #end date
    dateChunkSixe = 1  #fix 1 for better performance
    maxCheckToPassSingleDate = 500
    batch_size_for_async_request = 1
    postQueryTypes = ['C', 'L']
    #argmnts = list(forward_date_range(dtS_, dtE_, dateChunkSixe))
    #argmnts = list(set(argmnts))
    searchable_dates = []
    dtS = time.strptime(dtS_, '%Y-%m-%d')
    dtE = time.strptime(dtE_, '%Y-%m-%d')
    dt_ranges = [date(dtS.tm_year, dtS.tm_mon, dtS.tm_mday), date(dtE.tm_year, dtE.tm_mon, dtE.tm_mday)]
    if isinstance(xSearchDateList, list) and len(xSearchDateList) > 0:
        searchable_dates = xSearchDateList
    elif len(dt_ranges) > 0 and dt_ranges[0].year != 1900:
        for dt in daterange(dt_ranges[0], dt_ranges[1]):
            searchable_dates.append(dt.strftime("%d-%m-%Y"))
    else:
        raise Exception
    searchable_dates_weekdays = searchable_dates  #[onday for onday in searchable_dates if parser.parse(onday).weekday() not in [4,5]]
    total_accounts_scraped = 0
    for bank_br in bank_brnch.items():
        for postQueryType in postQueryTypes:
            dfschallan = []
            branchCode = bank_br[0]
            accountsOfficeName = bank_br[-1]
            dones = [donedate.split("_")[-1] for donedate in donelist if branchCode + "_" + postQueryType in donedate]
            noacs = [donedate.split("_")[-1] for donedate in noaclist if branchCode + "_" + postQueryType in donedate]
            D1 = set(dones)
            D2 = set(noacs)
            D = D1.union(D2)
            searchable_dates_not_dones = [sd for sd in searchable_dates if sd not in D] + missedlist
            date_batches = [chunk for chunk in chunks(searchable_dates_not_dones, batch_size_for_async_request)]  #[:1]
            del D1, D2, noacs, dones, searchable_dates_not_dones
            for batch in date_batches:
                start_time = time.time()
                asyncio.get_event_loop().run_until_complete(main(branchCode, postQueryType, batch))
                end_time = time.time()
                total_accounts_scraped += len(batch)
                print("Took {} seconds to pull {} accounts.....................................................".format(end_time - start_time, total_accounts_scraped))
                gc.collect()
            #Write for each branch code for each postQuery type
            if merge_folder_csv_after_run:
                date_batches = searchable_dates_weekdays
            elif len(date_batches) < 1:
                continue
            curdir = pathlib.Path().absolute()
            bank_folder = sanitize_file_folder_name(bank_brnch[branchCode])
            csv_paths = curdir.joinpath(bank_folder)
            flat_list_dates_done = list(flatten(date_batches))
            file_name = sanitize_file_folder_name(bank_folder + "_" + postQueryType + "_" + flat_list_dates_done[0] + "_to_" + flat_list_dates_done[-1]) + "_" + '.csv'
            write_path = csv_paths.joinpath(file_name)
            column_headers = ['UniqueID', 'challan_no', 'date', 'bank', 'branch', 'code', 'name', 'Amount', 'Timestamp']
            matched_csvs = []
            pattern_sample = '{}_{}_{}'.format(bank_folder, branchCode, postQueryType)
            for item in csv_paths.glob(r'**/*'):
                if pattern_sample in str(item):
                    # retrieve the groups of interest
                    matched_csvs.append(str(item))
            if len(matched_csvs) > 0:
                dfcsv_br_code = pd.concat([pd.read_csv(f, header=None, names=column_headers) for f in list(set(matched_csvs))], axis=0, ignore_index=True)
                dfcsv_br_code_unique = dfcsv_br_code.drop_duplicates(subset=column_headers[1:-1], keep='first')
                #remove any row that contains 'bank'
                dfcsv_br_code_unique = dfcsv_br_code_unique[dfcsv_br_code_unique['bank'] != 'bank']
                dfcsv_br_code_unique.to_csv(write_path, encoding='utf-8-sig', index=False, mode='w', header=True)
            ##Write/save df while running
            try:
                flat_list_dates_done = list(flatten(date_batches))
                file_name = sanitize_file_folder_name(bank_brnch[branchCode] + "_" + postQueryType + "_" + flat_list_dates_done[0] + "_to_" + flat_list_dates_done[-1]) + "_" + '.csv'
                if len(dfschallan) > 0:
                    dfschallanConcated = pd.concat(dfschallan)
                    dfschallanConcated.to_csv(file_name, encoding='utf-8-sig', index=False, mode='w', header=True)
            except:
                pass
            gc.collect()