cloudFPGA (cF) API  1.0
The documentation of the source code of cloudFPGA (cF)
mngmt.py
Go to the documentation of this file.
1 # /*******************************************************************************
2 # * Copyright 2016 -- 2022 IBM Corporation
3 # *
4 # * Licensed under the Apache License, Version 2.0 (the "License");
5 # * you may not use this file except in compliance with the License.
6 # * You may obtain a copy of the License at
7 # *
8 # * http://www.apache.org/licenses/LICENSE-2.0
9 # *
10 # * Unless required by applicable law or agreed to in writing, software
11 # * distributed under the License is distributed on an "AS IS" BASIS,
12 # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # * See the License for the specific language governing permissions and
14 # * limitations under the License.
15 # *******************************************************************************/
16 
17 # *
18 # * cloudFPGA
19 # * Copyright IBM Research, All Rights Reserved
20 # * =============================================
21 # * Created: Mar. 2021
22 # * Authors: FAB, WEI, NGL, DID
23 # *
24 # * Description:
25 # * A Python library with functions for accessing the cFRM and creating/deleting images, instances, clusters, etc.
26 # *
27 # *
28 
29 import os
30 import sys
31 import requests
32 from requests.utils import requote_uri
33 import json
34 import time
35 import datetime
36 
37 from cfsp_user import *
38 from cfsp_globals import *
39 
40 from cfsp_util import errorReqExit
41 
42 
43 
44 
45 
48 
49 
class cFcluster:
    """Representation of a specific cloudFPGA cluster.

    Attributes:
        user: the user object the cluster was created/queried with; the
            module-level cluster helpers call ``cluster.user.get_auth_string()``.
        cluster_data: the cluster description dict; refreshed by
            ``get_cluster_data``.
        id: value of ``cluster_data['cluster_id']``.
    """

    def __init__(self, user: cFuser, cluster_data):
        """Store the owning user and the cluster description.

        Raises:
            KeyError: if ``cluster_data`` has no ``'cluster_id'`` entry.
        """
        # Attribute names must be `user`, `cluster_data` and `id`: the
        # module-level functions access them under exactly these names.
        self.user = user
        self.cluster_data = cluster_data
        self.id = cluster_data['cluster_id']

    def get_id(self):
        """Return the cluster id taken from the cluster description."""
        return self.id
60 
61 
def post_cluster(user: cFuser, number_of_FPGA_nodes, role_image_id, host_address):
    """Request a new cluster of one host node plus FPGA nodes.

    The request body is a list of node descriptions: node 0 is the non-FPGA
    node reachable at ``host_address``; nodes ``1..number_of_FPGA_nodes`` are
    FPGA nodes that all use ``role_image_id``.

    Returns:
        A ``cFcluster`` wrapping the decoded response on HTTP 200; otherwise
        whatever ``errorReqExit`` returns (presumably it terminates the
        program -- verify in cfsp_util).

    On a request timeout the error is printed and the process exits with
    status 1.
    """
    print("Creating FPGA cluster...")
    try:
        t_begin = time.time()

        # Rank 0 is the software node; FPGA ranks start at 1.
        nodes = [{'image_id': __NON_FPGA_IDENTIFIER__,
                  'node_id': 0,
                  'node_ip': host_address}]
        nodes.extend(
            {'image_id': str(role_image_id), 'node_id': rank}
            for rank in range(1, number_of_FPGA_nodes + 1)
        )

        url = "http://" + __cf_manager_url__ + "/clusters?{0}&dont_verify_memory=0".format(
            user.get_auth_string(with_project=True))
        response = requests.post(url, json=nodes, timeout=__POST_CLUSTER_TIMEOUT__)
        duration = time.time() - t_begin

        if response.status_code == 200:
            created = json.loads(response.text)
            print("Id of new cluster: {}".format(created['cluster_id']))
            print("Time for POST cluster: \t{0}s\n".format(duration))
            return cFcluster(user, created)

        # Any other status code is reported through the shared error helper.
        return errorReqExit("POST cluster", response.status_code)
    except requests.exceptions.Timeout as timeout_err:
        # Maybe set up for a retry
        print(timeout_err)
        print("ERROR: Something went wrong with post_cluster request and it reached timeout="+str(__POST_CLUSTER_TIMEOUT__)+". Maybe retry or increase timeout value.\n")
        sys.exit(1)
101 
102 
def get_cluster_data(cluster: cFcluster):
    """Fetch the current description of ``cluster`` and cache it on the object.

    Returns:
        The decoded JSON response on HTTP 200 (also stored in
        ``cluster.cluster_data``); otherwise the result of ``errorReqExit``.
        After a request timeout only an error is printed, so the function
        returns ``None``.
    """
    print("Requesting cluster data for cluster_id={0} ...".format(cluster.get_id()))  # FIXME: .get_id()
    try:
        t_begin = time.time()
        url = "http://" + __cf_manager_url__ + "/clusters/" + str(cluster.get_id()) + "?{0}".format(
            cluster.user.get_auth_string())
        response = requests.get(url, timeout=__GET_CLUSTER_TIMEOUT__)
        duration = time.time() - t_begin
        print("Time for GET cluster: \t{0}s\n".format(duration))

        if response.status_code == 200:
            fresh_data = json.loads(response.text)
            # Keep the object in sync with what the manager reported.
            cluster.cluster_data = fresh_data
            return fresh_data

        # Unexpected status code: hand over to the shared error helper.
        return errorReqExit("GET cluster", response.status_code)
    except requests.exceptions.Timeout as timeout_err:
        # Maybe set up for a retry
        print(timeout_err)
        print("ERROR: Something went wrong with get_cluster_data request and it reached timeout="+str(__GET_CLUSTER_TIMEOUT__)+". Maybe retry or increase timeout value.\n")
124 
125 
def get_clusters_data(user: cFuser, limit=100):
    """List the clusters visible to ``user``, passing ``limit`` as a query parameter.

    Returns:
        The decoded JSON response on HTTP 200; otherwise the result of
        ``errorReqExit``. After a request timeout only an error is printed,
        so the function returns ``None``.
    """
    print("Requesting clusters data (limit="+str(limit)+")...")
    try:
        t_begin = time.time()
        url = "http://" + __cf_manager_url__ + "/clusters" + "?{0}&limit={1}".format(
            user.get_auth_string(), limit)
        response = requests.get(url, timeout=__GET_CLUSTER_TIMEOUT__)
        duration = time.time() - t_begin

        # Echo the URL that was actually requested, then the timing.
        print(response.request.url)
        print("Time for GET clusters: \t{0}s\n".format(duration))

        if response.status_code == 200:
            return json.loads(response.text)

        # Unexpected status code: hand over to the shared error helper.
        return errorReqExit("GET clusters", response.status_code)
    except requests.exceptions.Timeout as timeout_err:
        # Maybe set up for a retry
        print(timeout_err)
        print("ERROR: Something went wrong with get_cluster request and it reached timeout="+str(__GET_CLUSTER_TIMEOUT__)+". Maybe retry or increase timeout value.\n")
145 
146 
def delete_cluster_data(cluster: cFcluster):
    """Ask the manager to delete ``cluster``.

    Returns:
        ``0`` when the manager answers HTTP 204; otherwise the result of
        ``errorReqExit``. After a request timeout only an error is printed,
        so the function returns ``None``.
    """
    print("Requesting delete cluster_id={0} ...".format(cluster.get_id()))
    try:
        t_begin = time.time()
        url = "http://" + __cf_manager_url__ + "/clusters/" + str(cluster.get_id()) + "?{0}".format(
            cluster.user.get_auth_string())
        response = requests.delete(url, timeout=__DELETE_CLUSTER_TIMEOUT__)
        duration = time.time() - t_begin
        print("Time for DELETE cluster: \t{0}s\n".format(duration))

        if response.status_code == 204:
            return 0

        # Anything but 204 is reported through the shared error helper.
        return errorReqExit("DELETE cluster", response.status_code)
    except requests.exceptions.Timeout as timeout_err:
        # Maybe set up for a retry
        print(timeout_err)
        print("ERROR: Something went wrong with delete_cluster_data request and it reached timeout="+str(__DELETE_CLUSTER_TIMEOUT__)+". Maybe retry or increase timeout value.\n")
164 
165 
def restart_cluster_apps(cluster: cFcluster):
    """Request an application restart on the FPGA(s) of ``cluster``.

    Sends a PATCH to ``/clusters/<id>/restart``. On HTTP 200 the decoded
    response body is printed and the function returns ``None``; on any other
    status code the result of ``errorReqExit`` is returned.

    Any ``Exception`` raised while performing or processing the request is
    printed and the process exits with status 1.
    """
    print("Requesting restart for (all) FPGA(s) of cluster_id={0} ...".format(cluster.get_id()))
    try:
        start = time.time()
        # NOTE(review): unlike the GET/POST/DELETE helpers, no timeout is
        # passed here, so this call can block indefinitely.
        r1 = requests.patch(
            "http://" + __cf_manager_url__ + "/clusters/" + str(cluster.get_id()) + "/restart?{0}".format(
                cluster.user.get_auth_string()))
        elapsed = time.time() - start
        print("Time for RESTART cluster: \t{0}s\n".format(elapsed))

        if r1.status_code != 200:
            # Unexpected status code: hand over to the shared error helper.
            return errorReqExit("PATCH cluster restart", r1.status_code)
        print(r1.content.decode())

    except Exception as e:
        print("ERROR: Failed to reset the FPGA(s) role(s)")
        print(str(e))
        # sys.exit, not the bare exit(): the latter is injected by the `site`
        # module for interactive use and is not guaranteed to exist.
        sys.exit(1)
185 
186 
189 
190 
192 
class cFinstance:
    """Representation of a specific cloudFPGA instance.

    Attributes:
        instance_data: the instance description dict; refreshed by
            ``get_instance_data``.
        user: the user object the instance belongs to; the module-level
            instance helpers call ``instance.user.get_auth_string()``.
        id: value of ``instance_data['instance_id']``.
    """

    def __init__(self, user: cFuser, instance_data):
        """Store the owning user and the instance description.

        Raises:
            KeyError: if ``instance_data`` has no ``'instance_id'`` entry.
        """
        # Attribute names must be `instance_data`, `user` and `id`: the
        # module-level functions access them under exactly these names.
        self.instance_data = instance_data
        self.user = user
        self.id = instance_data['instance_id']

    def get_id(self):
        """Return the instance id taken from the instance description."""
        return self.id
200 
def get_instances_data(user: cFuser, limit=100):
    """List the instances visible to ``user``, passing ``limit`` as a query parameter.

    Returns:
        The decoded JSON response on HTTP 200; otherwise the result of
        ``errorReqExit``. After a request timeout only an error is printed,
        so the function returns ``None``.
    """
    print("Requesting instances data (limit="+str(limit)+")...")
    try:
        t_begin = time.time()
        url = "http://" + __cf_manager_url__ + "/instances" + "?{0}&limit={1}".format(
            user.get_auth_string(), limit)
        response = requests.get(url, timeout=__GET_INSTANCE_TIMEOUT__)
        duration = time.time() - t_begin

        # Echo the URL that was actually requested, then the timing.
        print(response.request.url)
        print("Time for GET instances: \t{0}s\n".format(duration))

        if response.status_code == 200:
            return json.loads(response.text)

        # Unexpected status code: hand over to the shared error helper.
        return errorReqExit("GET instances", response.status_code)
    except requests.exceptions.Timeout as timeout_err:
        # Maybe set up for a retry
        print(timeout_err)
        print("ERROR: Something went wrong with get_instances request and it reached timeout="+str(__GET_INSTANCE_TIMEOUT__)+". Maybe retry or increase timeout value.\n")
220 
def create_instance():
    """Placeholder for instance creation; not implemented yet.

    Only prints a TODO marker and returns ``None``.
    """
    print("TODO \n")
223 
224 
def get_instance_data(instance: cFinstance):
    """Fetch the current description of ``instance`` and cache it on the object.

    Returns:
        The decoded JSON response on HTTP 200 (also stored in
        ``instance.instance_data``); otherwise the result of ``errorReqExit``.
        After a request timeout only an error is printed, so the function
        returns ``None``.
    """
    print("Requesting instance data for instance_id={0} ...".format(instance.get_id()))
    try:
        t_begin = time.time()
        url = "http://" + __cf_manager_url__ + "/instances/" + str(instance.get_id()) + "?{0}".format(
            instance.user.get_auth_string())
        response = requests.get(url, timeout=__GET_INSTANCE_TIMEOUT__)
        duration = time.time() - t_begin
        print("Time for GET instance: \t{0}s\n".format(duration))

        if response.status_code == 200:
            fresh_data = json.loads(response.text)
            # Keep the object in sync with what the manager reported.
            instance.instance_data = fresh_data
            return fresh_data

        # Unexpected status code: hand over to the shared error helper.
        return errorReqExit("GET instance", response.status_code)
    except requests.exceptions.Timeout as timeout_err:
        # Maybe set up for a retry
        print(timeout_err)
        print("ERROR: Something went wrong with get_instance_data request and it reached timeout="+str(__GET_INSTANCE_TIMEOUT__)+". Maybe retry or increase timeout value.\n")
246 
247 
248 
def reprogram_instance():
    """Placeholder for instance reprogramming; not implemented yet.

    Only prints a TODO marker and returns ``None``.
    """
    print("TODO \n")
251 
252 
def api_request_instance():
    """Placeholder for a generic instance API request; not implemented yet.

    Only prints a TODO marker and returns ``None``.
    """
    print("TODO \n")
255 
256 
def restart_instance_app(instance: cFinstance):
    """Request an application restart on the FPGA of ``instance``.

    Sends a PATCH to ``/instances/<id>/app_restart``. On HTTP 200 the decoded
    response body is printed and the function returns ``None``; on any other
    status code the result of ``errorReqExit`` is returned.

    Any ``Exception`` raised while performing or processing the request is
    printed and the process exits with status 1.
    """
    print("Requesting restart for instance_id={0} ...".format(instance.get_id()))
    try:
        start = time.time()
        # NOTE(review): unlike the GET helpers, no timeout is passed here,
        # so this call can block indefinitely.
        r1 = requests.patch(
            "http://" + __cf_manager_url__ + "/instances/" + str(instance.get_id()) + "/app_restart?{0}".format(
                instance.user.get_auth_string()))
        elapsed = time.time() - start
        print("Time for RESTART instance: \t{0}s\n".format(elapsed))

        if r1.status_code != 200:
            # Unexpected status code: hand over to the shared error helper.
            return errorReqExit("PATCH instance restart", r1.status_code)
        print(r1.content.decode())

    except Exception as e:
        print("ERROR: Failed to reset the FPGA role")
        print(str(e))
        # sys.exit, not the bare exit(): the latter is injected by the `site`
        # module for interactive use and is not guaranteed to exist.
        sys.exit(1)
276 
def delete_instance(instance: cFinstance):
    """Ask the manager to delete ``instance`` and return the HTTP status code.

    Status codes listed by the original author:
        204 Instance was deleted
        401 Unauthenticated, bad login
        403 Unauthorized
        404 Instance does not exist

    A confirmation line is printed only when the status code is not above
    204. The status code is returned in every case.
    """
    print("deleting instance {}".format(instance.id))

    url = "http://" + __cf_manager_url__ + "/instances/{0}?{1}".format(
        instance.id, instance.user.get_auth_string())
    response = requests.delete(url)

    status = response.status_code
    if status <= 204:
        print("Instance {} removed".format(instance.id))
    return status
295 
296 
297 
300 
class cFimage:
    """Representation of a specific cloudFPGA image.

    Attributes:
        user: the user object the image is associated with.
        image_data: the image description dict as passed in.
        id: value of ``image_data['id']``.
        comment: value of ``image_data['comment']``.
        required_shell: value of ``image_data['shell_type']``.
    """

    def __init__(self, user: cFuser, image_data):
        """Store the user and unpack the commonly used image fields.

        Raises:
            KeyError: if ``image_data`` lacks ``'id'``, ``'comment'`` or
                ``'shell_type'``.
        """
        # Plain attribute names, matching the convention of the cluster and
        # instance classes (`.user`, `.id`, ...).
        self.user = user
        self.image_data = image_data
        self.id = image_data['id']
        self.comment = image_data['comment']
        self.required_shell = image_data['shell_type']
309 
310 
def get_images(user: cFuser):
    """Placeholder for listing images; not implemented yet.

    ``user`` is currently unused. Only prints a TODO marker and returns
    ``None``.
    """
    print("TODO \n")
313 
314 
def get_image(image: cFimage):
    """Placeholder for fetching a single image; not implemented yet.

    ``image`` is currently unused. Only prints a TODO marker and returns
    ``None``.
    """
    print("TODO \n")
317 
318 
def post_image(image: cFimage):
    """Placeholder for uploading an image; not implemented yet.

    ``image`` is currently unused. Only prints a TODO marker and returns
    ``None``.
    """
    print("TODO \n")
321 
322 
def delete_image(image: cFimage):
    """Placeholder for deleting an image; not implemented yet.

    ``image`` is currently unused. Only prints a TODO marker and returns
    ``None``.
    """
    print("TODO \n")
325 
326 
327 
330 
331 # TODO: no resource class so far, because the resource data should reside inside the CFRM, not the cFSP
332 
def get_resource_status(resource_id, admin: cFuser):
    """Query the status of the resource ``resource_id``.

    The request is authenticated with ``admin.get_auth_string()``.

    Returns:
        The decoded JSON response on HTTP 200; otherwise the result of
        ``errorReqExit``.
    """
    print("Requesting resource status...")
    url = "http://" + __cf_manager_url__ + "/resources/" + str(
        resource_id) + "/status/" + "?{0}".format(admin.get_auth_string())
    response = requests.get(url)

    if response.status_code == 200:
        return json.loads(response.text)

    # Unexpected status code: hand over to the shared error helper.
    return errorReqExit("GET resource status", response.status_code)
345 
346 
def set_resource_status(resource_id, new_status, admin: cFuser):
    """Set the status of the resource ``resource_id`` to ``new_status``.

    Possible status values noted by the original author:
    "AVAILABLE", "USED", "MAINTENANCE".

    Returns:
        The HTTP status code (204) on success, after printing a
        confirmation; otherwise the result of ``errorReqExit``.
    """
    url = "http://" + __cf_manager_url__ + "/resources/{0}/status/?{1}&new_status={2}".format(
        resource_id, admin.get_auth_string(), new_status)
    response = requests.put(url)

    if response.status_code != 204:
        # Unexpected status code: hand over to the shared error helper.
        return errorReqExit("PUT /resources/{resource_id}/status/", response.status_code)

    print("Resource {} set to {}".format(resource_id, new_status))
    return response.status_code
363 
364 
Clusters functions.
Definition: mngmt.py:50
def __init__(self, cFuser user, cluster_data)
Definition: mngmt.py:53
def get_id(self)
Definition: mngmt.py:58
Images functions.
Definition: mngmt.py:301
def __init__(self, cFuser user, image_data)
Definition: mngmt.py:303
Instances functions.
Definition: mngmt.py:191
def __init__(self, cFuser user, instance_data)
Definition: mngmt.py:193
def errorReqExit(msg, code)
Definition: cfsp_util.py:63
def api_request_instance()
Definition: mngmt.py:253
def get_cluster_data(cFcluster cluster)
Definition: mngmt.py:103
def get_instance_data(cFinstance instance)
Definition: mngmt.py:225
def delete_instance(cFinstance instance)
Definition: mngmt.py:277
def get_instances_data(cFuser user, limit=100)
Definition: mngmt.py:201
def post_cluster(cFuser user, number_of_FPGA_nodes, role_image_id, host_address)
Definition: mngmt.py:62
def set_resource_status(resource_id, new_status, cFuser admin)
Definition: mngmt.py:347
def get_image(cFimage image)
Definition: mngmt.py:315
def create_instance()
Definition: mngmt.py:221
def delete_cluster_data(cFcluster cluster)
Definition: mngmt.py:147
def get_images(cFuser user)
Definition: mngmt.py:311
def restart_instance_app(cFinstance instance)
Definition: mngmt.py:257
def get_resource_status(resource_id, cFuser admin)
Resources functions (admin only)
Definition: mngmt.py:333
def restart_cluster_apps(cFcluster cluster)
Definition: mngmt.py:166
def delete_image(cFimage image)
Definition: mngmt.py:323
def reprogram_instance()
Definition: mngmt.py:249
def get_clusters_data(cFuser user, limit=100)
Definition: mngmt.py:126
def post_image(cFimage image)
Definition: mngmt.py:319