I implemented the following code to retrieve medication names from a given text (prescription_text
in the code). It works by matching words in the text with a list of existing medications (US_DRUGS
in the code).
The code is working but it's very slow since the medication list that I use is pretty big (150.000 items). I need it to be faster.
The key part of this code is obviously this one:
for DRUG in US_DRUGS: for match in regex.finditer(re.escape(DRUG.name), re.escape(normalized_prescription_text), re.IGNORECASE): matched_drugs.append(Drug(DRUG.name, DRUG.atc, match.span()))
Can someone take a look at my code and maybe show me some improvements I could make ?
The code:
import unidecode class Drug: def __init__(self, name, atc, position): # normalize name make it capitalize self.name = name.upper() self.atc = atc self.start = position[0] if position is not None else None self.stop = position[1] if position is not None else None def __eq__(self, other): return self.name == other.name and self.atc == other.atc def __hash__(self): return hash(( 'name', self.name, 'atc', self.atc )) US_DRUGS = [ Drug("MED1", 1, None), Drug("MED2", 2, None), Drug("MED3", 3, None), Drug("MED4", 4, None), Drug("MED5", 5, None), Drug("MED6", 6, None) # imagine this list way bigger (around 150.000 items) ] def _extract_drugs_from_prescription_text(prescription_text): # normalize prescription text (remove accents) normalized_prescription_text = unidecode.unidecode(prescription_text) # remove non word character normalized_prescription_text = re.sub(r'\W+', ' ', normalized_prescription_text) # For every occurrence of a drug's name in the prescription text # it will append a Drug() object with match's details in a list matched_drugs = [] for DRUG in US_DRUGS: for match in regex.finditer(re.escape(DRUG.name), re.escape(normalized_prescription_text), re.IGNORECASE): matched_drugs.append(Drug(DRUG.name, DRUG.atc, match.span())) # Will clean up the matches list from duplicates substring # ex: 'DOLIPRANE' and 'DOLIPRANE CODEINE' # if they start at the same point, first one is removed matched_drugs_without_substring = [] for match in matched_drugs: if [m for m in matched_drugs if m.start <= match.start <= m.stop and len(match.name) < len(m.name)]: pass else: matched_drugs_without_substring.append(match) # remove duplicates return list(set(matched_drugs_without_substring)) if __main__ == "__main__": prescription_text = "- TEST - some example text here with some medication names like MED1, MED2, MED3. End of the test #$%^" _extract_drugs_from_prescription_text(prescription_text)
undefined name '__main__'
,undefined name 're'
,undefined name 'regex'
\$\endgroup\$