meta-stx: re-name and re-org to align with upstream
[pti/rtp.git] / meta-starlingx / meta-stx-virt / recipes-extended / ceph / files / ceph-manage-journal.py
diff --git a/meta-starlingx/meta-stx-virt/recipes-extended/ceph/files/ceph-manage-journal.py b/meta-starlingx/meta-stx-virt/recipes-extended/ceph/files/ceph-manage-journal.py
new file mode 100644 (file)
index 0000000..f91cbc1
--- /dev/null
@@ -0,0 +1,334 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2019 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+import ast
+import os
+import os.path
+import re
+import subprocess
+import sys
+
+# Substring looked for in a disk node name; when present,
+# is_partitioning_correct() names partitions "<disk>p<N>" instead of
+# "<disk><N>".
+DEVICE_NAME_NVME = "nvme"
+
+#########
+# Utils #
+#########
+
+
+def command(arguments, **kwargs):
+    """Execute a command and capture stdout, stderr & return code.
+
+    :param arguments: argument list handed to subprocess.Popen
+    :param kwargs: extra keyword arguments forwarded to subprocess.Popen
+    :returns: tuple (stdout, stderr, returncode); both streams are text
+              unless the caller explicitly asks for something else
+    """
+    # Every caller in this script matches the output against str patterns or
+    # concatenates it with str paths. Without text mode Python 3 hands back
+    # bytes and those operations raise TypeError, so default to text while
+    # still letting a caller override the choice.
+    if "text" not in kwargs:
+        kwargs.setdefault("universal_newlines", True)
+    process = subprocess.Popen(
+        arguments,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        **kwargs)
+    out, err = process.communicate()
+    return out, err, process.returncode
+
+
+def get_input(arg, valid_keys):
+    """Convert the input to a dict and perform basic validation.
+
+    :param arg: string holding a Python dict literal; two-character
+                backslash-n sequences are turned into real newlines first
+    :param valid_keys: keys that must all be present in the parsed dict
+    :returns: the parsed dict, or None when the string cannot be parsed,
+              does not evaluate to a dict, or lacks a required key
+    """
+    literal = arg.replace("\\n", "\n")
+    try:
+        input_dict = ast.literal_eval(literal)
+        # A list or a string would also satisfy the membership test below,
+        # so insist on a real dict before looking at its keys.
+        if not isinstance(input_dict, dict):
+            return None
+        if not all(k in input_dict for k in valid_keys):
+            return None
+    except Exception:
+        # literal_eval raises several unrelated exception types on bad
+        # input; any of them simply means the argument is unusable.
+        return None
+
+    return input_dict
+
+
+def get_partition_uuid(dev):
+    """Return the PARTUUID that blkid prints for dev, or None if absent."""
+    blkid_out = command(['blkid', dev])[0]
+    match = re.search('PARTUUID=\"(.+?)\"', blkid_out)
+    if match is None:
+        return None
+    return match.group(1)
+
+
+def device_path_to_device_node(device_path):
+    """Resolve a device path to its device node.
+
+    Runs "udevadm settle -E" on the path, then resolves it with
+    "readlink -f". Returns the readlink output with trailing whitespace
+    stripped, or None if running either command raised an exception.
+    """
+    try:
+        command(["udevadm", "settle", "-E", device_path])
+        resolved = command(["readlink", "-f", device_path])[0]
+        return resolved.rstrip()
+    except Exception:
+        return None
+
+
+###########################################
+# Manage Journal Disk Partitioning Scheme #
+###########################################
+
+# Directory of partition-uuid symlinks: create_partitions() removes stale
+# links found here and fix_location() builds the journal link target from it.
+DISK_BY_PARTUUID = "/dev/disk/by-partuuid/"
+# Passed to "sgdisk --typecode" by create_partitions().
+JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106'  # Type of a journal partition
+
+
+def is_partitioning_correct(disk_path, partition_sizes):
+    """Validate the existence and size of journal partitions.
+
+    :param disk_path: device path of the journal disk
+    :param partition_sizes: expected partition sizes in MiB, in partition
+                            order (the first entry describes partition 1)
+    :returns: True if the disk has a GPT table and every listed partition
+              has exactly the expected size; False, after printing the
+              reason, otherwise
+    """
+
+    # Obtain the device node from the device path.
+    disk_node = device_path_to_device_node(disk_path)
+
+    # Check that partition table format is GPT
+    command(["udevadm", "settle", "-E", disk_node])
+    output, _, _ = command(["parted", "-s", disk_node, "print"])
+    if not re.search('Partition Table: gpt', output):
+        print("Format of disk node %s is not GPT, zapping disk" % disk_node)
+        return False
+
+    # NVMe partitions are named <disk>p<N>, all others <disk><N>
+    separator = 'p' if DEVICE_NAME_NVME in disk_node else ''
+
+    # Check that each partition size matches the one in input
+    for partition_index, size in enumerate(partition_sizes, 1):
+        partition_node = '{}{}{}'.format(disk_node, separator,
+                                         partition_index)
+
+        command(["udevadm", "settle", "-E", partition_node])
+        cmd = ["parted", "-s", partition_node, "unit", "MiB", "print"]
+        output, _, _ = command(cmd)
+
+        # Accept "<size>MiB" and "<size>.00MiB" only. A character class of
+        # dots and zeros would also swallow extra trailing zeros and so
+        # accept a 10240MiB partition when 1024 was requested. Node and size
+        # are escaped so they are compared literally.
+        regex = (r"^Disk " + re.escape(str(partition_node)) + r":\s*" +
+                 re.escape(str(size)) + r"(?:\.0*)?MiB")
+        if not re.search(regex, output, re.MULTILINE):
+            print("Journal partition %(node)s size is not %(size)s, "
+                  "zapping disk" % {"node": partition_node, "size": size})
+            return False
+
+    command(["udevadm", "settle", "-t", "10"])
+    return True
+
+
+def create_partitions(disk_path, partition_sizes):
+    """Recreate partitions.
+
+    Writes a fresh GPT table to the disk, removes by-partuuid symlinks that
+    still resolve to partitions of that disk, then creates one partition per
+    entry of partition_sizes and tags it as a ceph journal.
+
+    :param disk_path: device path of the journal disk
+    :param partition_sizes: partition sizes in MiB, in creation order
+
+    Exits the script with status 1 if the partition table or any partition
+    cannot be created; a failure to set name/typecode is only a warning.
+    """
+
+    # Obtain the device node from the device path.
+    disk_node = device_path_to_device_node(disk_path)
+
+    # Issue: After creating a new partition table on a device, Udev does not
+    # always remove old symlinks (i.e. to previous partitions on that device).
+    # Also, even if links are erased before zapping the disk, some of them will
+    # be recreated even though there is no partition to back them!
+    # Therefore, we have to remove the links AFTER we erase the partition table
+    # Issue: DISK_BY_PARTUUID directory is not present at all if there are no
+    # GPT partitions on the storage node so nothing to remove in this case
+    links = []
+    if os.path.isdir(DISK_BY_PARTUUID):
+        candidates = [os.path.join(DISK_BY_PARTUUID, name)
+                      for name in os.listdir(DISK_BY_PARTUUID)]
+        links = [link for link in candidates if os.path.islink(link)]
+
+    # Erase all partitions on current node by creating a new GPT table
+    _, err, ret = command(["parted", "-s", disk_node, "mktable", "gpt"])
+    if ret:
+        print("Error erasing partition table of %(node)s\n"
+              "Return code: %(ret)s reason: %(reason)s" %
+              {"node": disk_node, "ret": ret, "reason": err})
+        sys.exit(1)
+
+    # Erase old symlinks. Match "<disk>[p]<number>" exactly: a plain
+    # substring test would also hit partitions of other disks whose name
+    # merely starts with this one (e.g. /dev/sdaa1 when working on /dev/sda).
+    own_partition = re.compile(re.escape(disk_node) + r"p?\d+$")
+    for link in links:
+        if own_partition.match(os.path.realpath(link)):
+            os.remove(link)
+
+    # Create partitions in order
+    used_space_mib = 1  # leave 1 MB at the beginning of the disk
+    for num, size in enumerate(partition_sizes, 1):
+        cmd = ['parted', '-s', disk_node, 'unit', 'mib',
+               'mkpart', 'primary',
+               str(used_space_mib), str(used_space_mib + size)]
+        _, err, ret = command(cmd)
+        parms = {"disk_node": disk_node,
+                 "start": used_space_mib,
+                 "end": used_space_mib + size,
+                 "reason": err}
+        # Report success only once the return code has been checked.
+        if ret:
+            print("Failed to create partition with "
+                  "start=%(start)s, end=%(end)s "
+                  "on %(disk_node)s reason: %(reason)s" % parms)
+            sys.exit(1)
+        print("Created partition from start=%(start)s MiB to end=%(end)s MiB"
+              " on %(disk_node)s" % parms)
+        # Set partition type to ceph journal
+        # noncritical operation, it makes 'ceph-disk list' output correct info
+        cmd = ['sgdisk',
+               '--change-name={num}:ceph journal'.format(num=num),
+               '--typecode={num}:{uuid}'.format(
+                   num=num,
+                   uuid=JOURNAL_UUID,
+               ),
+               disk_node]
+        _, err, ret = command(cmd)
+        if ret:
+            print("WARNING: Failed to set partition name and typecode")
+        used_space_mib += size
+
+
+###########################
+# Manage Journal Location #
+###########################
+
+# Parent directory of the OSD mount points; mount_data_partition() mounts the
+# data partition of OSD <id> at OSD_PATH + "ceph-<id>".
+OSD_PATH = "/var/lib/ceph/osd/"
+
+
+def mount_data_partition(data_path, osdid):
+    """Mount an OSD data partition and return the mounted path.
+
+    :param data_path: device path of the OSD data partition
+    :param osdid: OSD id; the mount point is OSD_PATH + "ceph-<osdid>"
+    :returns: the mount point path
+
+    Exits the script with status 1 if the partition has to be mounted and
+    the mount command fails.
+    """
+
+    # Obtain the device node from the device path.
+    data_node = device_path_to_device_node(data_path)
+
+    mount_path = OSD_PATH + "ceph-" + str(osdid)
+    output, _, _ = command(['mount'])
+    # 'mount' lists entries as "<node> on <path> type ...". Delimit both
+    # names with whitespace so that e.g. ceph-1 does not match a ceph-10
+    # entry, and escape them so they are compared literally.
+    regex = ("^" + re.escape(data_node) + r"\s+on\s+" +
+             re.escape(mount_path) + r"\s")
+    if not re.search(regex, output, re.MULTILINE):
+        cmd = ['mount', '-t', 'xfs', data_node, mount_path]
+        _, _, ret = command(cmd)
+        params = {"node": data_node, "path": mount_path}
+        if ret:
+            # "%(path)s" needs its conversion character; without it the
+            # formatting itself raised ValueError on this error path.
+            print("Failed to mount %(node)s to %(path)s, aborting" % params)
+            sys.exit(1)
+        else:
+            print("Mounted %(node)s to %(path)s" % params)
+    return mount_path
+
+
+def is_location_correct(path, journal_path, osdid):
+    """Tell whether the journal link under path resolves to journal_path.
+
+    Compares the real path of "<path>/journal" with the device node that
+    journal_path resolves to. osdid is accepted but not used.
+    """
+    expected_node = device_path_to_device_node(journal_path)
+    return os.path.realpath(path + "/journal") == expected_node
+
+
+def fix_location(mount_point, journal_path, osdid):
+    """Move the journal to the new partition.
+
+    Re-points "<mount_point>/journal" at the by-partuuid link of the journal
+    partition, records the uuid in "<mount_point>/journal_uuid", zeroes the
+    first 100 MiB of the partition and runs "ceph-osd --mkjournal".
+
+    :param mount_point: path where the OSD data partition is mounted
+    :param journal_path: device path of the new journal partition
+    :param osdid: id of the OSD that owns the journal
+
+    Exits the script with status 1 if the partition uuid cannot be
+    determined or the symlink cannot be replaced. Failures of the later
+    steps are only reported.
+    """
+
+    # Obtain the device node from the device path.
+    journal_node = device_path_to_device_node(journal_path)
+
+    # Fix symlink
+    path = mount_point + "/journal"  # 'journal' symlink path used by ceph-osd
+    journal_uuid = get_partition_uuid(journal_node)
+    if not journal_uuid:
+        # get_partition_uuid() returns None when blkid prints no PARTUUID;
+        # without it there is no link target to build.
+        print("Failed to get partition uuid of %s, aborting" % journal_node)
+        sys.exit(1)
+    new_target = DISK_BY_PARTUUID + journal_uuid
+    params = {"path": path, "target": new_target}
+    try:
+        if os.path.lexists(path):
+            os.unlink(path)  # delete the old symlink
+        os.symlink(new_target, path)
+        print("Symlink created: %(path)s -> %(target)s" % params)
+    except OSError:
+        # Only filesystem errors are expected here; anything else (including
+        # SystemExit / KeyboardInterrupt) must not be swallowed.
+        print("Failed to create symlink: %(path)s -> %(target)s" % params)
+        sys.exit(1)
+    # Fix journal_uuid
+    path = mount_point + "/journal_uuid"
+    try:
+        with open(path, 'w') as f:
+            f.write(journal_uuid)
+    except (IOError, OSError):
+        # The operation is noncritical, it only makes 'ceph-disk list'
+        # display complete output. We log and continue.
+        params = {"path": path, "uuid": journal_uuid}
+        print("WARNING: Failed to set uuid of %(path)s to %(uuid)s" % params)
+
+    # Clean the journal partition
+    # even if erasing the partition table, if another journal was present here
+    # it's going to be reused. Journals are always bigger than 100MB.
+    _, err, ret = command(['dd', 'if=/dev/zero', 'of=%s' % journal_node,
+                           'bs=1M', 'count=100'])
+    if ret:
+        print("WARNING: Failed to clean journal partition %(node)s "
+              "reason: %(reason)s" % {"node": journal_node, "reason": err})
+
+    # Format the journal
+    cmd = ['/usr/bin/ceph-osd', '-i', str(osdid),
+           '--pid-file', '/var/run/ceph/osd.%s.pid' % osdid,
+           '-c', '/etc/ceph/ceph.conf',
+           '--cluster', 'ceph',
+           '--mkjournal']
+    _, err, ret = command(cmd)
+    params = {"journal_node": journal_node,
+              "osdid": osdid,
+              "ret": ret,
+              "reason": err}
+    if not ret:
+        print("Prepared new journal partition: %(journal_node)s "
+              "for osd id: %(osdid)s" % params)
+    else:
+        print("Error initializing journal node: "
+              "%(journal_node)s for osd id: %(osdid)s "
+              "ceph-osd return code: %(ret)s reason: %(reason)s" % params)
+
+
+########
+# Main #
+########
+
+def main(argv):
+    """Parse the command line and run the requested journal operation.
+
+    :param argv: exactly two items: the mode ("partitions" or "location")
+                 and a dict literal with that mode's parameters
+                 ('disk_path' and 'journals' for "partitions";
+                 'data_path', 'journal_path' and 'osdid' for "location")
+
+    Exits the script with status -1 when the arguments are not valid.
+    """
+    # parse and validate arguments
+    err = False
+    partitions = None
+    location = None
+    if len(argv) != 2:
+        err = True
+    elif argv[0] == "partitions":
+        valid_keys = ['disk_path', 'journals']
+        partitions = get_input(argv[1], valid_keys)
+        if not partitions:
+            err = True
+        elif not isinstance(partitions['journals'], list):
+            err = True
+    elif argv[0] == "location":
+        valid_keys = ['data_path', 'journal_path', 'osdid']
+        location = get_input(argv[1], valid_keys)
+        if not location:
+            err = True
+        elif not isinstance(location['osdid'], int):
+            err = True
+    else:
+        err = True
+    if err:
+        print("Command intended for internal use only")
+        sys.exit(-1)
+
+    if partitions:
+        # Recreate partitions only if the existing ones don't match input
+        if not is_partitioning_correct(partitions['disk_path'],
+                                       partitions['journals']):
+            create_partitions(partitions['disk_path'], partitions['journals'])
+        else:
+            print("Partition table for %s is correct, "
+                  "no need to repartition" %
+                  device_path_to_device_node(partitions['disk_path']))
+    elif location:
+        # The data partition has to be mounted; it is left mounted afterwards
+        mount_point = mount_data_partition(location['data_path'],
+                                           location['osdid'])
+        # Update journal location only if link point to another partition
+        if not is_location_correct(mount_point,
+                                   location['journal_path'],
+                                   location['osdid']):
+            print("Fixing journal location for "
+                  "OSD id: %(id)s" % {"id": location['osdid']})
+            fix_location(mount_point,
+                         location['journal_path'],
+                         location['osdid'])
+        else:
+            # Adjacent literals are joined as-is, so the separating space
+            # has to be part of the first one.
+            print("Journal location for %s is correct, "
+                  "no need to change it" % location['data_path'])
+
+
+# Run only when executed as a script, so that importing the module (e.g. for
+# testing its helpers) does not parse sys.argv and exit.
+if __name__ == "__main__":
+    main(sys.argv[1:])