I am given a CSV file of stations, with data like this:
```
station_id,date,temperature_c
68,2000.375,10.500
68,2000.542,5.400
68,2000.958,23.000
68,2001.125,20.400
68,2001.292,13.300
68,2001.375,10.400
68,2001.958,21.800
68,2002.208,15.500
```
and so on for many different `station_id` values.
I want to write a Python program that (1) reports the minimum reading (the third column) and (2) reports the station with the maximum "travel distance" across its readings, i.e. the sum of the absolute changes between consecutive readings. For example, if a station has three readings of -5, 0 and 8, its travel distance is |0 - (-5)| + |8 - 0| = 13. The second report can take an optional date range. Here is what I did:
#!/usr/bin/python
"""Report the coldest reading and the most-"travelled" station in a CSV file.

The input file has a header row followed by rows of
``station_id,date,temperature_c``.
"""
from collections import defaultdict
import csv
import random
import sys


class Station:
    """Running statistics for a single station.

    The attributes are plain public attributes; read and assign them
    directly.  The ``get_*``/``set_*`` methods are kept only so that existing
    callers keep working.
    """

    def __init__(self):
        self.readings = []
        # +inf so that any real reading compares lower.
        self.minimum = float("inf")
        # Sum of absolute changes between consecutive readings.
        self.travel = 0
        # Previous temperature seen by add_temperature(); None until the
        # first reading arrives.
        self.last_temp = None

    def add_temperature(self, temp):
        """Add the absolute change from the previous reading to ``travel``.

        The first reading only establishes the baseline and adds nothing.
        """
        if self.last_temp is not None:
            self.travel += abs(temp - self.last_temp)
        self.last_temp = temp

    def reset_travel(self):
        """Forget the accumulated travel and the previous reading."""
        self.travel = 0
        self.last_temp = None

    # -- backward-compatible accessors ------------------------------------

    def get_travel(self):
        """Return the accumulated travel."""
        return self.travel

    def set_travel(self, n):
        """Add ``abs(n)`` to the accumulated travel (``n`` is a change)."""
        self.travel += abs(n)

    def get_minimum(self):
        """Return the lowest temperature recorded for this station."""
        return self.minimum

    def set_minimum(self, n):
        """Set the lowest temperature recorded for this station."""
        self.minimum = n

    def get_readings(self):
        """Return the list of stored ``{"date": ..., "temp": ...}`` dicts."""
        return self.readings

    def set_readings(self, date, temp):
        """Append one reading; ``date`` and ``temp`` are stored as given."""
        self.readings.append({"date": date, "temp": temp})


class Reporter:
    """Computes statistics over the stations found in a CSV file."""

    def __init__(self, store_readings=True):
        """Create an empty reporter.

        args: store_readings -- when true (the default, matching the
              historical behaviour) every row read is also appended to its
              station's ``readings`` list.  Pass False to keep memory use
              independent of the number of rows.
        """
        self.store_readings = store_readings
        self.stations = defaultdict(Station)
        self.global_minimum = {
            "station": "default",
            "date": 1,
            "temp": float("inf"),
        }
        self.longest_travel = {"station": "default", "range": 0}
        # Number of readings seen so far that equal the current global
        # minimum; drives the uniform random tie-break.
        self._minimum_ties = 0

    def _rows(self, filename):
        """Yield ``(station, date, temp)`` string triples from ``filename``.

        The header row and blank rows are skipped.  The file is streamed, so
        only one row is held at a time (plus stored readings, if enabled).
        Exits the program with a message on a ``csv.Error``.  A row that does
        not have exactly three fields raises ``ValueError``.
        """
        with open(filename, "r") as datafile:
            csv_reader = csv.reader(datafile)
            try:
                # Default of None keeps an empty file from raising
                # StopIteration here.
                next(csv_reader, None)
                for row in csv_reader:
                    if not row:
                        continue
                    station, date, temp = row
                    if self.store_readings:
                        self.stations[station].set_readings(date, temp)
                    yield station, date, temp
            except csv.Error as e:
                sys.exit('file {}, line {}: {}'.format(
                    filename, csv_reader.line_num, e))

    def minimum_temperature(self, filename):
        """Determine which station recorded the coldest temperature.

        Each station's own ``minimum`` is updated along the way.  When
        several readings share the lowest temperature, one of them is chosen
        uniformly at random.

        args: CSV file
        returns: dict with keys "station", "date" (string as read from the
                 file) and "temp" (float); the initial placeholder dict if
                 the file has no data rows.
        """
        for station, date, temp in self._rows(filename):
            temp = float(temp)
            entry = self.stations[station]
            if temp < entry.minimum:
                entry.minimum = temp

            lowest = self.global_minimum["temp"]
            if temp < lowest:
                self._minimum_ties = 1
            elif temp == lowest:
                self._minimum_ties += 1
                # Replacing the current pick with probability 1/k on the
                # k-th tie gives every tied reading the same chance.
                if random.randrange(self._minimum_ties) != 0:
                    continue
            else:
                continue
            self.global_minimum = {
                "station": station,
                "date": date,
                "temp": temp,
            }
        return self.global_minimum

    def max_travel(self, filename, begin=1.0, end=9999.9):
        """Determine which station "traveled" the most.

        Travel is the sum of absolute temperature changes between
        consecutive readings of a station, in file order (presumably
        chronological per station -- verify against the data).  Only
        readings with ``begin < date < end`` (both bounds exclusive) are
        considered.  Every call starts from zero, so one Reporter can be
        queried repeatedly with different ranges.

        args: CSV file, begin date (optional), end date (optional)
        returns: dict with keys "station" and "range"; on equal travel the
                 station that reached the value first is reported.
        """
        for entry in self.stations.values():
            entry.reset_travel()
        self.longest_travel = {"station": "default", "range": 0}

        for station, date, temp in self._rows(filename):
            if not begin < float(date) < end:
                continue
            entry = self.stations[station]
            entry.add_temperature(float(temp))
            if entry.travel > self.longest_travel["range"]:
                self.longest_travel = {
                    "station": station,
                    "range": entry.travel,
                }
        return self.longest_travel


def main(argv=None):
    """Print the three reports for the CSV file named on the command line."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: {} CSV_FILE".format(argv[0] if argv else "stations"))
    csv_file = argv[1]

    # Nothing below inspects individual readings, so don't keep them.
    reporter = Reporter(store_readings=False)

    # fetch lowest temperature
    global_minimum = reporter.minimum_temperature(csv_file)
    print("station {} had the global minimum on {}".format(
        global_minimum["station"], global_minimum["date"]))

    # fetch maximum travel overall
    longest_travel = reporter.max_travel(csv_file)
    print("station {} had the greatest travel at {}".format(
        longest_travel["station"], longest_travel["range"]))

    # now try a date range
    begin = 2001.0
    end = 2006.0
    longest_travel = reporter.max_travel(csv_file, begin, end)
    print("for {} to {}, station {} had the greatest travel at {}".format(
        begin, end, longest_travel["station"], longest_travel["range"]))


if __name__ == "__main__":
    main()
I'm particularly interested in speeding it up and reducing its memory usage, but also in how to deal Pythonically with getters/setters.