I have collected large dataset of trajectories in 3D space. Below is the information about dataset-
- Each trajectory (shape: n x 4) is saved into CSV file with the following header:
time, p_x, p_y, p_z
- The filenames are defined in following way: subject_
i
_trail_i
.csv, wherei
starts from 1 to 10.
Below are first 10 lines from subject_1_trail_1.csv
file-
time p_x p_y p_z 0 0.4333 0.1107 0.1259 0.0103 0.4336 0.1106 0.126 0.02 0.4334 0.1108 0.1259 0.03 0.4333 0.1106 0.1259 0.04 0.4334 0.1107 0.1259 0.0501 0.4328 0.1103 0.126 0.06 0.4331 0.1107 0.1255 0.0703 0.4331 0.1103 0.126 0.08 0.4324 0.1102 0.126
For each subject, I want to plot the trajectory showing median and variance as shown below-
fill_between(x, perc_25, perc_75)
I am using NumPy but the code is relatively long. Below is the code snippet-
from pylab import * import sys # make a 3d numpy array (chop off the extra rows) def load(location, subject, suffix): all_data = [] min_rows = sys.maxint for i in range(1, 11): file_name = location + subject + '_trail_' + str(i) + '.csv' data = np.loadtxt(file_name, delimiter=',', skiprows=1) if data.shape[0] < min_rows: min_rows = data.shape[0] all_data.append(data) all_crop_data = [] for data in all_data: all_crop_data.append(data[:min_rows,:]) return np.array(all_crop_data) def proc(data): t = data[:, :, 0] # first column is time x = data[:, :, 1] # second column is x y = data[:, :, 2] # third column is y z = data[:, :, 3] # fourth column is z t_median = np.zeros(data.shape[1]) t_perc_25 = np.zeros(data.shape[1]) t_perc_75 = np.zeros(data.shape[1]) x_median = np.zeros(data.shape[1]) x_perc_25 = np.zeros(data.shape[1]) x_perc_75 = np.zeros(data.shape[1]) y_median = np.zeros(data.shape[1]) y_perc_25 = np.zeros(data.shape[1]) y_perc_75 = np.zeros(data.shape[1]) z_median = np.zeros(data.shape[1]) z_perc_25 = np.zeros(data.shape[1]) z_perc_75 = np.zeros(data.shape[1]) for i in range(data.shape[1]): # for each timestamp t_median[i] = np.median(t[:, i]) t_perc_25[i] = np.percentile(t[:, i], 25) t_perc_75[i] = np.percentile(t[:, i], 75) x_median[i] = np.median(x[:, i]) x_perc_25[i] = np.percentile(x[:, i], 25) x_perc_75[i] = np.percentile(x[:, i], 75) y_median[i] = np.median(y[:, i]) y_perc_25[i] = np.percentile(y[:, i], 25) y_perc_75[i] = np.percentile(y[:, i], 75) z_median[i] = np.median(z[:, i]) z_perc_25[i] = np.percentile(z[:, i], 25) z_perc_75[i] = np.percentile(z[:, i], 75) all_median = np.vstack((t_median, x_median, y_median, z_median)).T all_perc_25 = np.vstack((t_perc_25, x_perc_25, y_perc_25, z_perc_25)).T all_perc_75 = np.vstack((t_perc_75, x_perc_75, y_perc_75, z_perc_75)).T return all_median, all_perc_25, all_perc_75 s1 = load('/Desktop/', 'subject_1') # subject 1 s2 = load('/Desktop/', 'subject_2') # subject 2 x = np.arange(0, s1.shape[1]) s1_med, s1_perc_25, s1_perc_75 = proc(s1) s2_med, s2_perc_25, s2_perc_75 = proc(s2) # lets plot only x (second column) index = 1 s1_med = s1_med[:, index] s1_perc_25 = s1_perc_25[:, index] s1_perc_75 = s1_perc_75[:, index] s2_med = s2_med[:, index] s2_perc_25 = s2_perc_25[:, index] s2_perc_75 = s2_perc_75[:, index] fill_between(x, s1_perc_25, s1_perc_75, alpha=0.25, linewidth=0, color='#B22400') fill_between(x, s2_perc_25, s2_perc_75, alpha=0.25, linewidth=0, color='#006BB2') plot(x, s1_med, linewidth=2, color='#B22400') plot(x, s2_med, linewidth=2, color='#006BB2')
I am looking for a better, i.e., pythonic way to achieve the same. I am not sure if Pandas can be useful here. Looking for your suggestions, please.