3 # Copyright (c) 2019 Wind River Systems, Inc.
5 # SPDX-License-Identifier: Apache-2.0
15 DEVICE_NAME_NVME = "nvme"
def command(arguments, **kwargs):
    """Execute a command and capture stdout, stderr & return code.

    :param arguments: program and arguments, handed to subprocess.Popen.
    :param kwargs: extra keyword arguments forwarded to subprocess.Popen.
                   stdout and stderr are always piped by this function, so
                   they must not be supplied by the caller.
    :returns: tuple (stdout, stderr, return code) of the finished process.
    """
    # Callers match the captured output against str regexes. Without text
    # mode Python 3 returns bytes and re.search() raises TypeError, so
    # default to text pipes unless the caller explicitly picked a mode.
    mode_options = ("universal_newlines", "text", "encoding", "errors")
    if not any(option in kwargs for option in mode_options):
        kwargs["universal_newlines"] = True

    process = subprocess.Popen(
        arguments,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        **kwargs)
    out, err = process.communicate()
    return out, err, process.returncode
def get_input(arg, valid_keys):
    """Convert the input to a dict and perform basic validation.

    :param arg: string holding a Python dict literal; newlines may be
                encoded as the two characters backslash + 'n'.
    :param valid_keys: keys that must all be present in the parsed dict.
    :returns: the parsed dict, or None when the text is not a valid literal,
              is not a dict, or lacks one of the required keys.
    """
    literal_text = arg.replace("\\n", "\n")
    try:
        parsed = ast.literal_eval(literal_text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RuntimeError):
        # Malformed or too deeply nested literal
        return None

    # A bare string or list also supports `in`, so a key test alone would
    # let non-dict input through (e.g. "'abc'" with required key 'a').
    if not isinstance(parsed, dict):
        return None
    if any(key not in parsed for key in valid_keys):
        return None
    return parsed
def get_partition_uuid(dev):
    """Return the PARTUUID that blkid reports for a device.

    :param dev: device node to query with `blkid`.
    :returns: the PARTUUID string, or None when blkid's output contains no
              PARTUUID field.
    """
    output, _, _ = command(['blkid', dev])
    match = re.search(r'PARTUUID="(.+?)"', output)
    if match is None:
        return None
    return match.group(1)
def device_path_to_device_node(device_path):
    """Resolve a device path to the device node it points to.

    :param device_path: path (typically a symlink) identifying a device.
    :returns: the canonical path printed by `readlink -f`, without the
              trailing newline, or None if running the helper commands
              raised an exception.
    """
    try:
        # Let udev settle first; -E stops the wait once the path exists.
        command(["udevadm", "settle", "-E", device_path])
        resolved, _, _ = command(["readlink", "-f", device_path])
    except Exception:
        return None

    return resolved.rstrip()
65 ###########################################
66 # Manage Journal Disk Partitioning Scheme #
67 ###########################################
69 DISK_BY_PARTUUID = "/dev/disk/by-partuuid/"
70 JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106' # Type of a journal partition
def is_partitioning_correct(disk_path, partition_sizes):
    """Validate the existence and size of journal partitions.

    :param disk_path: device path of the journal disk.
    :param partition_sizes: expected partition sizes in MiB, listed in
                            partition order (first entry is partition 1).
    :returns: True when the disk has a GPT partition table and every
              expected partition reports the expected size, else False.
    """

    # Obtain the device node from the device path.
    disk_node = device_path_to_device_node(disk_path)

    # Check that partition table format is GPT
    command(["udevadm", "settle", "-E", disk_node])
    output, _, _ = command(["parted", "-s", disk_node, "print"])
    if not re.search('Partition Table: gpt', output):
        print("Format of disk node %s is not GPT, zapping disk" % disk_node)
        return False

    # NVMe partition nodes put a 'p' between the disk node and the index.
    separator = 'p' if DEVICE_NAME_NVME in disk_node else ''

    # Check that each partition size matches the one in input
    for partition_index, size in enumerate(partition_sizes, 1):
        partition_node = '{}{}{}'.format(disk_node, separator,
                                         partition_index)

        command(["udevadm", "settle", "-E", partition_node])
        cmd = ["parted", "-s", partition_node, "unit", "MiB", "print"]
        output, _, _ = command(cmd)

        # Accept "<size>MiB" or "<size>.00MiB", but not extra integer
        # zeros: a bare "[.0]*" would let size 1024 match "10240MiB".
        regex = ("^Disk " + re.escape(partition_node) + ":\\s*" +
                 re.escape(str(size)) + "(?:\\.0*)?MiB")
        if not re.search(regex, output, re.MULTILINE):
            print("Journal partition %(node)s size is not %(size)s, "
                  "zapping disk" % {"node": partition_node, "size": size})
            return False

    command(["udevadm", "settle", "-t", "10"])

    return True
112 def create_partitions(disk_path, partition_sizes):
113 """Recreate partitions"""
115 # Obtain the device node from the device path.
116 disk_node = device_path_to_device_node(disk_path)
118 # Issue: After creating a new partition table on a device, Udev does not
119 # always remove old symlinks (i.e. to previous partitions on that device).
120 # Also, even if links are erased before zapping the disk, some of them will
121 # be recreated even though there is no partition to back them!
122 # Therefore, we have to remove the links AFTER we erase the partition table
123 # Issue: DISK_BY_PARTUUID directory is not present at all if there are no
124 # GPT partitions on the storage node so nothing to remove in this case
126 if os.path.isdir(DISK_BY_PARTUUID):
127 links = [os.path.join(DISK_BY_PARTUUID, l) for l in os.listdir(DISK_BY_PARTUUID)
128 if os.path.islink(os.path.join(DISK_BY_PARTUUID, l))]
130 # Erase all partitions on current node by creating a new GPT table
131 _, err, ret = command(["parted", "-s", disk_node, "mktable", "gpt"])
133 print("Error erasing partition table of %(node)s\n"
134 "Return code: %(ret)s reason: %(reason)s" %
135 {"node": disk_node, "ret": ret, "reason": err})
140 if disk_node in os.path.realpath(l):
143 # Create partitions in order
144 used_space_mib = 1 # leave 1 MB at the beginning of the disk
146 for size in partition_sizes:
147 cmd = ['parted', '-s', disk_node, 'unit', 'mib',
149 str(used_space_mib), str(used_space_mib + size)]
150 _, err, ret = command(cmd)
151 parms = {"disk_node": disk_node,
152 "start": used_space_mib,
153 "end": used_space_mib + size,
155 print("Created partition from start=%(start)s MiB to end=%(end)s MiB"
156 " on %(disk_node)s" % parms)
158 print("Failed to create partition with "
159 "start=%(start)s, end=%(end)s "
160 "on %(disk_node)s reason: %(reason)s" % parms)
162 # Set partition type to ceph journal
163 # noncritical operation, it makes 'ceph-disk list' output correct info
165 '--change-name={num}:ceph journal'.format(num=num),
166 '--typecode={num}:{uuid}'.format(
171 _, err, ret = command(cmd)
173 print("WARNINIG: Failed to set partition name and typecode")
174 used_space_mib += size
178 ###########################
179 # Manage Journal Location #
180 ###########################
182 OSD_PATH = "/var/lib/ceph/osd/"
def mount_data_partition(data_path, osdid):
    """Mount an OSD data partition and return the mounted path.

    :param data_path: device path of the OSD data partition.
    :param osdid: OSD id, used to build the mount point under OSD_PATH.
    :returns: the mount point path.
    :raises SystemExit: with code 1 when the mount command fails.
    """

    # Obtain the device node from the device path.
    data_node = device_path_to_device_node(data_path)

    mount_path = OSD_PATH + "ceph-" + str(osdid)
    output, _, _ = command(['mount'])

    # Match the node and the mount point as whole words, so that sdb1 is
    # not taken for sdb10, nor ceph-1 for ceph-10.
    regex = ("^" + re.escape(data_node) + r"\s.*\s" +
             re.escape(mount_path) + r"(?:\s|$)")
    if not re.search(regex, output, re.MULTILINE):
        cmd = ['mount', '-t', 'xfs', data_node, mount_path]
        _, _, ret = command(cmd)
        params = {"node": data_node, "path": mount_path}
        if ret:
            # "%(path)" without the trailing 's' is an invalid format and
            # raised ValueError instead of printing this message.
            print("Failed to mount %(node)s to %(path)s, aborting" % params)
            raise SystemExit(1)
        print("Mounted %(node)s to %(path)s" % params)

    return mount_path
def is_location_correct(path, journal_path, osdid):
    """Check if location points to the correct device.

    :param path: mount point of the OSD data partition; its 'journal'
                 entry is the link being checked.
    :param journal_path: device path of the expected journal partition.
    :param osdid: OSD id; not used by the check itself.
    :returns: True when the resolved 'journal' link equals the device node
              of journal_path, else False.
    """

    # Obtain the device node from the device path.
    journal_node = device_path_to_device_node(journal_path)

    cur_node = os.path.realpath(path + "/journal")
    return cur_node == journal_node
219 def fix_location(mount_point, journal_path, osdid):
220 """Move the journal to the new partition"""
222 # Obtain the device node from the device path.
223 journal_node = device_path_to_device_node(journal_path)
226 path = mount_point + "/journal" # 'journal' symlink path used by ceph-osd
227 journal_uuid = get_partition_uuid(journal_node)
228 new_target = DISK_BY_PARTUUID + journal_uuid
229 params = {"path": path, "target": new_target}
231 if os.path.lexists(path):
232 os.unlink(path) # delete the old symlink
233 os.symlink(new_target, path)
234 print("Symlink created: %(path)s -> %(target)s" % params)
236 print("Failed to create symlink: %(path)s -> %(target)s" % params)
239 path = mount_point + "/journal_uuid"
241 with open(path, 'w') as f:
242 f.write(journal_uuid)
243 except Exception as ex:
244 # The operation is noncritical, it only makes 'ceph-disk list'
245 # display complete output. We log and continue.
246 params = {"path": path, "uuid": journal_uuid}
247 print("WARNING: Failed to set uuid of %(path)s to %(uuid)s" % params)
249 # Clean the journal partition:
250 # even after the partition table is erased, a journal previously written
251 # here would be reused. Journals are always bigger than 100MB.
252 command(['dd', 'if=/dev/zero', 'of=%s' % journal_node,
253 'bs=1M', 'count=100'])
256 cmd = ['/usr/bin/ceph-osd', '-i', str(osdid),
257 '--pid-file', '/var/run/ceph/osd.%s.pid' % osdid,
258 '-c', '/etc/ceph/ceph.conf',
261 out, err, ret = command(cmd)
262 params = {"journal_node": journal_node,
267 print("Prepared new journal partition: %(journal_node)s "
268 "for osd id: %(osdid)s" % params)
270 print("Error initializing journal node: "
271 "%(journal_node)s for osd id: %(osdid)s "
272 "ceph-osd return code: %(ret)s reason: %(reason)s" % params)
280 # parse and validate arguments
286 elif argv[0] == "partitions":
287 valid_keys = ['disk_path', 'journals']
288 partitions = get_input(argv[1], valid_keys)
291 elif not isinstance(partitions['journals'], list):
293 elif argv[0] == "location":
294 valid_keys = ['data_path', 'journal_path', 'osdid']
295 location = get_input(argv[1], valid_keys)
298 elif not isinstance(location['osdid'], int):
303 print("Command intended for internal use only")
307 # Recreate partitions only if the existing ones don't match input
308 if not is_partitioning_correct(partitions['disk_path'],
309 partitions['journals']):
310 create_partitions(partitions['disk_path'], partitions['journals'])
312 print("Partition table for %s is correct, "
313 "no need to repartition" %
314 device_path_to_device_node(partitions['disk_path']))
316 # we need to have the data partition mounted & we can leave it mounted
317 mount_point = mount_data_partition(location['data_path'],
319 # Update journal location only if the link points to another partition
320 if not is_location_correct(mount_point,
321 location['journal_path'],
323 print("Fixing journal location for "
324 "OSD id: %(id)s" % {"node": location['data_path'],
325 "id": location['osdid']})
326 fix_location(mount_point,
327 location['journal_path'],
330 print("Journal location for %s is correct,"
331 "no need to change it" % location['data_path'])