I'm parsing specific values out of web pages with BeautifulSoup. However, since I'm using regex, my program takes forever to run. I'd love ideas on how to speed this up.
```python
from bs4 import BeautifulSoup
import datetime
import json
from progressbar import progressbar
import pdb
import pickle
import re


class Listing():
    def __init__(self, custom_name, **entries):
        self.__dict__.update(entries)
        self.custom_name = custom_name
        self.date_accessed = datetime.datetime.today()

    def __hash__(self):
        return hash(self.custom_name)

    def __eq__(self, other):
        return self.custom_name == other.custom_name

    def __repr__(self):
        return self.custom_name


def list_to_dict(rlist):
    # QUEST: There are multiple colons in many of the entries. I couldn't
    # figure out how to use re.split where it only split the first
    # occurrence, so instead I replace only the first occurrence and then
    # split on that new str
    list_with_replace_str = [re.sub(":", ":REPLACE", e, 1) for e in rlist]
    temp_dict = dict(f.split(":REPLACE") for f in list_with_replace_str)
    clean_dict = {}
    for key in temp_dict.keys():
        clean_key = key.strip()
        clean_value = temp_dict[key].strip()
        clean_dict[clean_key] = clean_value
    return clean_dict


def parse_listings(listing_objs):
    def parse_financials_div(financials_soup, listing_obj):
        try:
            financials_text = financials_soup.text
            financials_list = financials_text.split("\r\n")[:-1]
            financials_dict = list_to_dict(financials_list)
            not_included = []
            for key in financials_dict:
                if "*" in financials_dict[key]:
                    not_included.append(key)
            financials_dict["notIncluded"] = not_included
            for key in financials_dict:
                try:
                    financials_dict[key] = int(
                        re.sub("[^0-9]", "", financials_dict[key]))
                except Exception:
                    continue
            return financials_dict
        except Exception as e:
            print(f"error {e}")
            pdb.set_trace()

    def parse_details_div(details_soup, listing_obj):
        try:
            details_tag_list = details_soup.contents
            details_str = " ".join([str(element)
                                    for element in details_tag_list])
            details_list = details_str.split("<dt>")[1:]
            strs_to_tags = [BeautifulSoup(detail, "html.parser")
                            for detail in details_list]
            details_text = [tag.text for tag in strs_to_tags]
            details_dict = list_to_dict(details_text)
            return details_dict
        except Exception as e:
            print(f"error {e}")
            pdb.set_trace()

    def parse_category(product_json_soup, listing_obj):
        product_json_str = product_json_soup.contents[0].replace(
            "\r", "").replace("\n", "")
        product_json_str = product_json_str.replace(
            "\'", "").replace('\\"', '').replace("\t", "")
        product_dict = json.loads(product_json_str)
        category_str = product_dict["category"]
        category_list = category_str.split(">")
        category_list = [category.strip() for category in category_list]
        listing_obj.category = {}
        listing_obj.category["parent_category"] = category_list[0]
        try:
            listing_obj.category["sub_category"] = category_list[1]
        except Exception:
            listing_obj.category["sub_category"] = "Not Present"

    def parse_address(address_json_soup, listing_obj):
        address_json_str = address_json_soup.contents[0].replace(
            "\r", "").replace("\n", "")
        address_json_str = address_json_str.replace(
            "\'", "").replace('\\"', '').replace("\t", "")
        address_dict = json.loads(address_json_str)
        listing_obj.address = address_dict["address"]

    # Parse available listing fields into a dict
    print("Parse financials and details for listings")
    for listing_obj in progressbar(listing_objs):
        try:
            index = listing_objs.index(listing_obj)
            length = len(listing_objs)
            soup = BeautifulSoup(listing_obj.response_text, "html.parser")

            # Parse category
            category_json_pattern = re.compile(r"\"@type\" : \"Product\"")
            category_json_soup = soup.find(
                "script", {"type": "application/ld+json"},
                text=category_json_pattern)
            if category_json_soup:
                parse_category(category_json_soup, listing_obj)

            # Parse address
            address_json_pattern = re.compile(r"LocalBusiness")
            address_json_soup = soup.find(
                "script", {"type": "application/ld+json"},
                text=address_json_pattern)
            if address_json_soup:
                parse_address(address_json_soup, listing_obj)

            # Price details
            financials_span_pattern = re.compile(r"Asking Price:")
            financials_span_soup = soup.find(
                "span", text=financials_span_pattern)
            if financials_span_soup:
                financials_soup = financials_span_soup.parent.parent.parent.parent
                financials_dict = parse_financials_div(
                    financials_soup, listing_obj)
                listing_obj.financials = financials_dict
            else:
                print(
                    f"Financials not present #{index} of {length} {listing_obj.url}")
                print(soup)

            # Listing Details
            details_soup = soup.find("dl",
                                     {"class": "listingProfile_details"})
            if details_soup:
                details_dict = parse_details_div(details_soup, listing_obj)
                listing_obj.details = details_dict
        except Exception as e:
            print(f"error {e}")


def run_listing_calculations(listing_obj):
    # All in price
    extra_costs = 0
    price = listing_obj.financials["Asking Price"]
    for item in listing_obj.financials["notIncluded"]:
        if "Real Estate" not in item:
            extra_costs += listing_obj.financials[item]
    if isinstance(price, int):
        all_in_price = listing_obj.financials["Asking Price"] + extra_costs
    else:
        all_in_price = listing_obj.financials["Asking Price"]
    listing_obj.financials["allInPrice"] = all_in_price

    # Multiple
    all_in_price = listing_obj.financials["allInPrice"]
    cashflow = listing_obj.financials["Cash Flow"]
    try:
        listing_obj.financials["Multiple"] = all_in_price / cashflow
    except Exception:
        listing_obj.financials["Multiple"] = "N/A"


def parse_listings_from_pkl():
    with open("/Users/work/Dropbox/Projects/Working Data/bizbuysell/listings20191231.pkl", "rb") as infile:
        listing_objs = pickle.load(infile)

    print("Validate listing responses")
    listing_resp_validated = []
    for listing_obj in progressbar(listing_objs):
        try:
            if "Soup test failed" not in listing_obj.response_text:
                listing_resp_validated.append(listing_obj)
        except Exception:
            continue

    parse_listings(listing_resp_validated)

    print("Perform listing calculations")
    for listing_obj in progressbar(listing_resp_validated):
        financials_present = hasattr(listing_obj, "financials")
        if financials_present:
            run_listing_calculations(listing_obj)
    pdb.set_trace()


if __name__ == "__main__":
    parse_listings_from_pkl()
```
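On the QUEST comment in `list_to_dict`: `re.split` does accept a `maxsplit` argument (`re.split(":", e, maxsplit=1)`), and plain `str.partition` does the same job with no regex at all. Here is a minimal sketch of that rewrite, not the original author's code; note one behavioral difference: entries without a colon map to an empty value instead of raising.

```python
def list_to_dict(rlist):
    # str.partition splits at the first ":" only, so the ":REPLACE"
    # marker and the two regex passes are unnecessary.
    clean_dict = {}
    for entry in rlist:
        key, _, value = entry.partition(":")  # split on first ":" only
        clean_dict[key.strip()] = value.strip()
    return clean_dict
```

`entry.split(":", 1)` works the same way for entries that contain a colon; `partition` is just safer when the separator might be missing.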
Here's a link to the .pkl file needed to run the script above.

Here's a gist with the example HTML response and `product_json_soup`.
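On the speed question itself, a few things in `parse_listings` are worth checking before blaming the regexes: the three patterns are rebuilt on every pass through the loop, `listing_objs.index(listing_obj)` rescans the whole list on every iteration, and each page is parsed in full even though only a handful of tags are queried. A sketch of the usual fixes, assuming the same tag names as above:

```python
import re
from bs4 import BeautifulSoup, SoupStrainer
from progressbar import progressbar

# Compile once at module scope instead of once per listing.
CATEGORY_JSON_PATTERN = re.compile(r"\"@type\" : \"Product\"")
ADDRESS_JSON_PATTERN = re.compile(r"LocalBusiness")
FINANCIALS_SPAN_PATTERN = re.compile(r"Asking Price:")

# Build the tree only from the tags the loop actually queries.
WANTED_TAGS = SoupStrainer(["script", "span", "dl"])


def parse_listings(listing_objs):
    length = len(listing_objs)
    # enumerate() replaces the O(n) listing_objs.index() lookup
    # that previously ran once per iteration.
    for index, listing_obj in enumerate(progressbar(listing_objs)):
        soup = BeautifulSoup(listing_obj.response_text, "html.parser",
                             parse_only=WANTED_TAGS)
        ...  # same find() calls as above, using the module-level patterns
```

If `lxml` is installed, swapping `"html.parser"` for `"lxml"` usually speeds up the parse step as well; `SoupStrainer` works with both of those parsers, though not with `html5lib`.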
From the comments:

> `product_json_soup` is undefined in your code. Add more context and post testable `category_json_soup` content.

> I've added `cProfile` output for the larger program to the gist. Let me know if that helps. I could put in the entire function, but it's more or less just tweaks on this, and it seems these operations take the longest.
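Since `cProfile` comes up in that exchange, here is a minimal way to reproduce the measurement; the stats filename and the shell command's script name are placeholders, not from the original post:

```python
import cProfile
import pstats

# Profile a full run and print the 20 slowest call sites by cumulative
# time; re.sub and soup.find hotspots surface here. Assumes
# parse_listings_from_pkl is defined in the current namespace.
# Equivalent one-liner from the shell:
#     python -m cProfile -s cumtime your_script.py
cProfile.run("parse_listings_from_pkl()", "parse.stats")
pstats.Stats("parse.stats").sort_stats("cumtime").print_stats(20)
```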