
I'm parsing specific values out of web pages with BeautifulSoup. However, since I'm using RegEx, my program is taking forever to run. I'd love ideas on how to speed this up.

from bs4 import BeautifulSoup
import datetime
import json
from progressbar import progressbar
import pdb
import pickle
import re


class Listing():
    def __init__(self, custom_name, **entries):
        self.__dict__.update(entries)
        self.custom_name = custom_name
        self.date_accessed = datetime.datetime.today()

    def __hash__(self):
        return hash(self.custom_name)

    def __eq__(self, other):
        return self.custom_name == other.custom_name

    def __repr__(self):
        return self.custom_name


def list_to_dict(rlist):
    # QUEST: There are multiple colons in many of the entries. I couldn't
    # figure out how to use re.split where it only split the first occurence
    # so instead I replace only the first occurence and then split that new str
    list_with_replace_str = [re.sub(":", ":REPLACE", e, 1) for e in rlist]
    temp_dict = dict(f.split(":REPLACE") for f in list_with_replace_str)
    clean_dict = {}
    for key in temp_dict.keys():
        clean_key = key.strip()
        clean_value = temp_dict[key].strip()
        clean_dict[clean_key] = clean_value
    return clean_dict


def parse_listings(listing_objs):

    def parse_financials_div(financials_soup, listing_obj):
        try:
            financials_text = financials_soup.text
            financials_list = financials_text.split("\r\n")[:-1]
            financials_dict = list_to_dict(financials_list)
            not_included = []
            for key in financials_dict:
                if "*" in financials_dict[key]:
                    not_included.append(key)
            financials_dict["notIncluded"] = not_included
            for key in financials_dict:
                try:
                    financials_dict[key] = int(
                        re.sub("[^0-9]", "", financials_dict[key]))
                except Exception:
                    continue
            return financials_dict
        except Exception as e:
            print(f"error {e}")
            pdb.set_trace()

    def parse_details_div(details_soup, listing_obj):
        try:
            details_tag_list = details_soup.contents
            details_str = " ".join([str(element) for element in details_tag_list])
            details_list = details_str.split("<dt>")[1:]
            strs_to_tags = [BeautifulSoup(detail, "html.parser")
                            for detail in details_list]
            details_text = [tag.text for tag in strs_to_tags]
            details_dict = list_to_dict(details_text)
            return details_dict
        except Exception as e:
            print(f"error {e}")
            pdb.set_trace()

    def parse_category(product_json_soup, listing_obj):
        product_json_str = product_json_soup.contents[0].replace(
            "\r", "").replace("\n", "")
        product_json_str = product_json_str.replace(
            "\'", "").replace('\\"', '').replace("\t", "")
        product_dict = json.loads(product_json_str)
        category_str = product_dict["category"]
        category_list = category_str.split(">")
        category_list = [category.strip() for category in category_list]
        listing_obj.category = {}
        listing_obj.category["parent_category"] = category_list[0]
        try:
            listing_obj.category["sub_category"] = category_list[1]
        except Exception:
            listing_obj.category["sub_category"] = "Not Present"

    def parse_address(address_json_soup, listing_obj):
        address_json_str = address_json_soup.contents[0].replace(
            "\r", "").replace("\n", "")
        address_json_str = address_json_str.replace(
            "\'", "").replace('\\"', '').replace("\t", "")
        address_dict = json.loads(address_json_str)
        listing_obj.address = address_dict["address"]

    # Parse available listing fields into a dict
    print("Parse financials and details for listings")
    for listing_obj in progressbar(listing_objs):
        try:
            index = listing_objs.index(listing_obj)
            length = len(listing_objs)
            soup = BeautifulSoup(listing_obj.response_text, "html.parser")

            # Parse category
            category_json_pattern = re.compile(r"\"@type\" : \"Product\"")
            category_json_soup = soup.find(
                "script", {"type": "application/ld+json"},
                text=category_json_pattern)
            if category_json_soup:
                parse_category(category_json_soup, listing_obj)

            # Parse address
            address_json_pattern = re.compile(r"LocalBusiness")
            address_json_soup = soup.find(
                "script", {"type": "application/ld+json"},
                text=address_json_pattern)
            if address_json_soup:
                parse_address(address_json_soup, listing_obj)

            # Price details
            financials_span_pattern = re.compile(r"Asking Price:")
            financials_span_soup = soup.find(
                "span", text=financials_span_pattern)
            if financials_span_soup:
                financials_soup = financials_span_soup.parent.parent.parent.parent
                financials_dict = parse_financials_div(
                    financials_soup, listing_obj)
                listing_obj.financials = financials_dict
            else:
                print(
                    f"Financials not present #{index} of {length} {listing_obj.url}")
                print(soup)

            # Listing Details
            details_soup = soup.find("dl", {"class": "listingProfile_details"})
            if details_soup:
                details_dict = parse_details_div(details_soup, listing_obj)
                listing_obj.details = details_dict
        except Exception as e:
            print(f"error {e}")


def run_listing_calculations(listing_obj):
    # All in price
    extra_costs = 0
    price = listing_obj.financials["Asking Price"]
    for item in listing_obj.financials["notIncluded"]:
        if "Real Estate" not in item:
            extra_costs += listing_obj.financials[item]
    if isinstance(price, int):
        all_in_price = listing_obj.financials["Asking Price"] + extra_costs
    else:
        all_in_price = listing_obj.financials["Asking Price"]
    listing_obj.financials["allInPrice"] = all_in_price

    # Multiple
    all_in_price = listing_obj.financials["allInPrice"]
    cashflow = listing_obj.financials["Cash Flow"]
    try:
        listing_obj.financials["Multiple"] = all_in_price / cashflow
    except Exception:
        listing_obj.financials["Multiple"] = "N/A"


def parse_listings_from_pkl():
    with open("/Users/work/Dropbox/Projects/Working Data/bizbuysell/listings20191231.pkl", "rb") as infile:
        listing_objs = pickle.load(infile)

    print("Validate listing responses")
    listing_resp_validated = []
    for listing_obj in progressbar(listing_objs):
        try:
            if "Soup test failed" not in listing_obj.response_text:
                listing_resp_validated.append(listing_obj)
        except Exception:
            continue

    parse_listings(listing_resp_validated)

    print("Perform listing calculations")
    for listing_obj in progressbar(listing_resp_validated):
        financials_present = hasattr(listing_obj, "financials")
        if financials_present:
            run_listing_calculations(listing_obj)
    pdb.set_trace()


if __name__ == "__main__":
    parse_listings_from_pkl()

Here's a link to the .pkl file needed to run this.

Here's a gist with the example HTML response and product_json_soup.

  • product_json_soup is undefined in your code. Add more context and post a testable category_json_soup content. (Dec 31, 2019 at 17:58)
  • Cleaned up the naming and added a gist with the responses to make it testable. (Jan 1, 2020 at 18:16)
  • "program is taking forever to run" - there must be other bottlenecks on your side; the above fragment takes about 1 second to run on my machine. Does your actual script imply some looping and more extended parsing? (Jan 1, 2020 at 19:57)
  • You're right @RomanPerekhrest. It's fast for one listing, but when I iterate over 40,000 it is a very slow step. I've added the cProfile output for the larger program to the gist. Let me know if that helps. I could put in the entire function, but it's more or less just tweaks on this, and it seems the re operations take the longest. (Jan 1, 2020 at 22:22)
  • Can you share those 40,000 URLs so I could test the loop and get the actual estimates? (Jan 2, 2020 at 10:25)

1 Answer


Most of the time is consumed by the BeautifulSoup conversion, namely

soup = BeautifulSoup(listing_obj.response_text, "html.parser") 

For proof, first create a .pkl file of a reasonable size for debugging:

if __name__ == "__main__":
    with open("D:\\Downloads\\listings20191231.pkl", "rb") as infile:
        listing_objs = pickle.load(infile)
    data = listing_objs[222:666]
    with open("D:\\Python\\CR\\listings20191231.pkl", "wb") as oufile:
        pickle.dump(data, oufile, pickle.HIGHEST_PROTOCOL)

Then check and compare the time consumed using the following adapted code (I also removed all the progressbar stuff from the rest of the original code):

if __name__ == "__main__":
    import time
    import sys
    argcnt = len(sys.argv) - 1
    argtxt = 'parse_listings_from_pkl()' if argcnt == 0 else 'BeautifulSoup'
    startload = time.time()
    with open("D:\\Python\\CR\\listings20191231.pkl", "rb") as infile:
        listing_objs = pickle.load(infile)
    length = len(listing_objs)
    print('checking time: ', argtxt, length, 'records')
    start0 = time.time()
    if argcnt == 0:
        parse_listings_from_pkl()
    else:
        for listing_obj in listing_objs:    # progressbar(listing_objs):
            soap = BeautifulSoup(listing_obj.response_text, "html.parser")
    start1 = time.time()
    print("time consumed: ", argtxt, start1 - start0)

The output shows that circa 86 % of the time (100 * 32.761232137680054 / 38.00445818901062) is consumed by converting the original HTML into BeautifulSoup objects:

D:\Python\CR\234876.py 
checking time: parse_listings_from_pkl() 444 records
Validate listing responses
Parse financials and details for listings
Perform listing calculations
time consumed: parse_listings_from_pkl() 38.00445818901062
D:\Python\CR\234876.py 1 
checking time: BeautifulSoup 444 records
time consumed: BeautifulSoup 32.761232137680054

Although there are some optimizable parts in the rest of the pure Python code (I tried them, with only minor performance improvements), I found that the BeautifulSoup conversion time scales with the original HTML size, and most of what is inside the analyzed HTML is of no use to the parser.
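For example, in the question's code the re.compile calls run once per listing, and list_to_dict goes through a ":REPLACE" marker just to split on the first colon. Here is a minimal sketch of those two tweaks, reusing the question's own names; per the timings above, any gain here is small compared to the soup conversion itself:

import re

# The three search patterns can be compiled once at module level instead of
# inside the per-listing loop (re also caches compiled patterns internally,
# so this mostly buys readability plus a small constant saving).
CATEGORY_JSON_PATTERN = re.compile(r"\"@type\" : \"Product\"")
ADDRESS_JSON_PATTERN = re.compile(r"LocalBusiness")
FINANCIALS_SPAN_PATTERN = re.compile(r"Asking Price:")


def list_to_dict(rlist):
    # str.split with maxsplit=1 splits on the first colon only, which is
    # exactly what the ":REPLACE" marker in the original was emulating.
    clean_dict = {}
    for entry in rlist:
        key, value = entry.split(":", 1)
        clean_dict[key.strip()] = value.strip()
    return clean_dict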

Hence, I'd try cutting listing_obj.response_text down to its useful parts and converting only those parts to the <class 'bs4.BeautifulSoup'> type. Maybe Speeding up beautifulsoup or Simple HTML and XHTML parser could help with extracting the useful info from the original HTML?
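One option along those lines is bs4's own SoupStrainer, which makes html.parser build tree nodes only for the tags you name. A rough sketch, untested against the real listing pages; the function name and tag list below are just illustrative, and note that the financials lookup in the question climbs out of a <span> via .parent.parent..., so that part still needs the surrounding markup and can't be strained this aggressively:

from bs4 import BeautifulSoup, SoupStrainer

# Only build tree nodes for the tags that parse_listings actually queries:
# the JSON-LD <script> blocks and the listing-details <dl>.
ONLY_USEFUL_TAGS = SoupStrainer(["script", "dl"])


def make_partial_soup(response_text):
    """Parse just the <script> and <dl> tags out of a listing page."""
    return BeautifulSoup(response_text, "html.parser",
                         parse_only=ONLY_USEFUL_TAGS)

With that, soup.find("script", {"type": "application/ld+json"}, text=...) and soup.find("dl", {"class": "listingProfile_details"}) keep working while the parser skips most of the page; whether that recovers a useful share of the ~32 seconds measured above would need re-running the same 444-record comparison.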

