time_estimators.py

import sky
from sky import sky_logging

logger = sky_logging.init_logger(__name__)

def resnet50_estimate_runtime(resources):
    """A simple runtime model for Resnet50."""
    # 3.8 G Multiply-Adds, 2 FLOPs per MADD, 3 for fwd+bwd.
    flops_for_one_image = 3.8 * (10**9) * 2 * 3
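    # That is, 3.8e9 MADDs * 2 FLOPs/MADD * 3 (fwd + bwd) = 2.28e10 FLOPs
    # per training image.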

    def _v100(num_v100s):
        # Adds communication overheads per step (in seconds).
        communication_slack = 0.0
        if num_v100s == 4:
            communication_slack = 0.15
        elif num_v100s == 8:
            communication_slack = 0.30
        max_per_device_batch_size = 256
        effective_batch_size = max_per_device_batch_size * num_v100s
        # 112590 steps at batch size 1024 = 90 epochs.
        total_steps = 112590 * (1024.0 / effective_batch_size)
        flops_for_one_batch = flops_for_one_image * max_per_device_batch_size
        # 27 TFLOPs, harmonic mean b/t 15 TFLOPs (single-precision) & 120
        # TFLOPs (16-bit).
        utilized_flops = 27 * (10**12)
        # Overridden below: assume 1/3 utilization of the 16-bit peak instead.
        utilized_flops = 120 * (10**12) / 3
        estimated_step_time_seconds = (flops_for_one_batch / utilized_flops +
                                       communication_slack)
        estimated_run_time_seconds = estimated_step_time_seconds * total_steps
        return estimated_run_time_seconds
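
    # Worked example (added note, assuming the 1/3-util override above): for
    # num_v100s = 1, effective_batch_size = 256, total_steps = 112590 * 4 =
    # 450360, flops_for_one_batch ≈ 5.84e12, utilized_flops = 4e13, so each
    # step takes ~0.146 s and the full run takes ~6.6e4 s (about 18 hours).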
    if isinstance(resources.cloud, sky.AWS):
        instance = resources.instance_type
        if instance == 'p3.2xlarge':
            num_v100s = 1
        elif instance == 'p3.8xlarge':
            num_v100s = 4
        elif instance == 'p3.16xlarge':
            num_v100s = 8
        else:
            assert False, 'Not supported: {}'.format(resources)
        return _v100(num_v100s)
    elif isinstance(resources.cloud, sky.GCP):
        accelerators = resources.accelerators
        if accelerators is None:
            assert False, 'not supported'
        assert len(accelerators) == 1, resources
        # Grab the single (accelerator, count) pair.
        for acc, acc_count in accelerators.items():
            break
        if acc == 'V100':
            assert acc_count in [1, 2, 4, 8], resources
            return _v100(acc_count)
        assert acc == 'tpu-v3-8', resources
        tpu_v3_8_flops = 420 * (10**12)
        known_resnet50_utilization = 0.445  # From actual profiling.
        # GPU: fixed to 1/3 utilization.
        # TPU:
        #   - 1/4 util: doesn't work
        #   - 1/3 util: works
        #   - 1/2 util: works
        # Overridden below: hand-written 1/3 utilization for the TPU as well.
        known_resnet50_utilization = 1 / 3
        max_per_device_batch_size = 1024
        total_steps = 112590  # 112590 steps at batch size 1024 = 90 epochs.
        flops_for_one_batch = flops_for_one_image * max_per_device_batch_size
        utilized_flops = tpu_v3_8_flops * known_resnet50_utilization
        estimated_step_time_seconds = flops_for_one_batch / utilized_flops
        estimated_run_time_seconds = estimated_step_time_seconds * total_steps
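        # Rough sanity check (added note): 2.28e10 FLOPs/image * 1024
        # images/step ≈ 2.33e13 FLOPs/step; at 420e12 / 3 = 1.4e14 utilized
        # FLOPS, that is ~0.167 s/step, or ~5.2 hours over 112590 steps.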
        logger.debug(' tpu-v3-8 estimated_step_time_seconds %f',
                     estimated_step_time_seconds)
        return estimated_run_time_seconds
    else:
        assert False, 'not supported cloud in prototype: {}'.format(
            resources.cloud)
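

# Minimal usage sketch (hypothetical: assumes the prototype's sky.Resources
# takes a cloud object and an instance type; the actual constructor may
# differ):
#
#   resources = sky.Resources(cloud=sky.AWS(), instance_type='p3.16xlarge')
#   print(resnet50_estimate_runtime(resources))  # Estimated seconds, 8x V100.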


def resnet50_infer_estimate_runtime(resources):
    """A simple runtime model for Resnet50 inference."""
    # 3.8 G Multiply-Adds, 2 FLOPs per MADD (no backward pass for inference).
    flops_for_one_image = 3.8 * (10**9) * 2
    # Successive overrides; the last assignment (70M images) wins.
    num_images = 0.1 * 1e6  # TODO: vary this.
    num_images = 1e6  # TODO: vary this.
    num_images = 70 * 1e6  # TODO: vary this.
    instance = resources.instance_type
    # assert instance in ['p3.2xlarge', 'inf1.2xlarge', 'nvidia-t4'], instance
    if instance == 'p3.2xlarge':
        # 120 TFLOPS TensorCore; assume 1/3 utilization.
        logger.debug('****** trying 1/3 util for v100')
        utilized_flops = 120 * (10**12) / 3
        # # Max bs to keep p99 < 15ms.
        # max_per_device_batch_size = 8
        # max_per_device_batch_size = 8 * 1e3
        # max_per_device_batch_size = 1
        # num_v100s = 1
        # effective_batch_size = max_per_device_batch_size * num_v100s
        # # 112590 steps, 1024 BS = 90 epochs.
        # total_steps = num_images // effective_batch_size
        # flops_for_one_batch = flops_for_one_image * max_per_device_batch_size
        # estimated_step_time_seconds = flops_for_one_batch / utilized_flops
        # estimated_run_time_seconds = estimated_step_time_seconds * total_steps
        # TODO: this ignores offline vs. online serving; it treats the whole
        # workload as one huge batch.
        estimated_run_time_seconds = (flops_for_one_image * num_images /
                                      utilized_flops)
    elif instance == 'inf1.2xlarge':
        # Inferentia: 1 chip = 128 T[F?]OPS.
        # Each AWS Inferentia chip supports up to 128 TOPS (trillions of
        # operations per second) of performance (assume 16-bit, as it casts
        # to bfloat16 by default).
        # TODO: also assume 1/3 utilization.
        utilized_flops = 128 * (10**12) / 3
        # TODO: this ignores offline vs. online serving; it treats the whole
        # workload as one huge batch.
        estimated_run_time_seconds = (flops_for_one_image * num_images /
                                      utilized_flops)
    elif resources.accelerators is not None:
        accs = resources.accelerators
        # Grab the single (accelerator, count) pair.
        for acc, acc_count in accs.items():
            break
        assert acc == 'T4' and acc_count == 1, resources
        # T4 GPU: 65 TFLOPS fp16; assume 1/3 utilization.
        utilized_flops = 65 * (10**12) / 3
        estimated_run_time_seconds = (flops_for_one_image * num_images /
                                      utilized_flops)
    else:
        assert False, resources
    # print('** num images {} total flops {}'.format(
    #     num_images, flops_for_one_image * num_images))
    return estimated_run_time_seconds
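

# Rough scale check for the T4 branch (added note): 7.6e9 FLOPs/image * 7e7
# images ≈ 5.3e17 FLOPs; at 65e12 / 3 ≈ 2.2e13 utilized FLOPS, that is
# ~2.5e4 seconds, i.e. roughly 6.8 hours of offline inference.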