From 72e26caf360e84d81ac057ad4998c41c119e8fa7 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Thu, 22 Feb 2024 15:05:56 -0500 Subject: [PATCH 01/14] Change to unix domain for vtbuffer communication The semaphore arbitrated single channel sharing was proving to be too slow. Make the communication lockless by having dedicated sockets per request. --- confluent_server/confluent/consoleserver.py | 56 +++--- confluent_vtbufferd/vtbufferd.c | 179 +++++++++++++++----- 2 files changed, 161 insertions(+), 74 deletions(-) diff --git a/confluent_server/confluent/consoleserver.py b/confluent_server/confluent/consoleserver.py index ebfd8c97..19509eb5 100644 --- a/confluent_server/confluent/consoleserver.py +++ b/confluent_server/confluent/consoleserver.py @@ -62,39 +62,38 @@ def chunk_output(output, n): yield output[i:i + n] def get_buffer_output(nodename): - out = _bufferdaemon.stdin - instream = _bufferdaemon.stdout + out = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + out.setsockopt(socket.SOL_SOCKET, socket.SO_PASSCRED, 1) + out.connect("\x00confluent-vtbuffer") if not isinstance(nodename, bytes): nodename = nodename.encode('utf8') outdata = bytearray() - with _bufferlock: - out.write(struct.pack('I', len(nodename))) - out.write(nodename) - out.flush() - select.select((instream,), (), (), 30) - while not outdata or outdata[-1]: - try: - chunk = os.read(instream.fileno(), 128) - except IOError: - chunk = None - if chunk: - outdata.extend(chunk) - else: - select.select((instream,), (), (), 0) - return bytes(outdata[:-1]) + out.send(struct.pack('I', len(nodename))) + out.send(nodename) + select.select((out,), (), (), 30) + while not outdata or outdata[-1]: + try: + chunk = os.read(out.fileno(), 128) + except IOError: + chunk = None + if chunk: + outdata.extend(chunk) + else: + select.select((out,), (), (), 0) + return bytes(outdata[:-1]) def send_output(nodename, output): if not isinstance(nodename, bytes): nodename = nodename.encode('utf8') - with _bufferlock: - _bufferdaemon.stdin.write(struct.pack('I', len(nodename) | (1 << 29))) - _bufferdaemon.stdin.write(nodename) - _bufferdaemon.stdin.flush() - for chunk in chunk_output(output, 8192): - _bufferdaemon.stdin.write(struct.pack('I', len(chunk) | (2 << 29))) - _bufferdaemon.stdin.write(chunk) - _bufferdaemon.stdin.flush() + out = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + out.setsockopt(socket.SOL_SOCKET, socket.SO_PASSCRED, 1) + out.connect("\x00confluent-vtbuffer") + out.send(struct.pack('I', len(nodename) | (1 << 29))) + out.send(nodename) + for chunk in chunk_output(output, 8192): + out.send(struct.pack('I', len(chunk) | (2 << 29))) + out.send(chunk) def _utf8_normalize(data, decoder): # first we give the stateful decoder a crack at the byte stream, @@ -607,11 +606,8 @@ def initialize(): _bufferlock = semaphore.Semaphore() _tracelog = log.Logger('trace') _bufferdaemon = subprocess.Popen( - ['/opt/confluent/bin/vtbufferd'], bufsize=0, stdin=subprocess.PIPE, - stdout=subprocess.PIPE) - fl = fcntl.fcntl(_bufferdaemon.stdout.fileno(), fcntl.F_GETFL) - fcntl.fcntl(_bufferdaemon.stdout.fileno(), - fcntl.F_SETFL, fl | os.O_NONBLOCK) + ['/opt/confluent/bin/vtbufferd', 'confluent-vtbuffer'], bufsize=0, stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL) def start_console_sessions(): configmodule.hook_new_configmanagers(_start_tenant_sessions) diff --git a/confluent_vtbufferd/vtbufferd.c b/confluent_vtbufferd/vtbufferd.c index e89269b4..055a5263 100644 --- a/confluent_vtbufferd/vtbufferd.c +++ b/confluent_vtbufferd/vtbufferd.c @@ -1,8 +1,14 @@ +#include +#define _GNU_SOURCE #include #include #include #include #include +#include +#include +#include +#include #include "tmt.h" #define HASHSIZE 2053 #define MAXNAMELEN 256 @@ -10,13 +16,17 @@ struct terment { struct terment *next; char *name; + int fd; TMT *vt; }; #define SETNODE 1 #define WRITE 2 #define READBUFF 0 +#define CLOSECONN 3 +#define MAXEVTS 16 static struct terment *buffers[HASHSIZE]; +static char* nodenames[HASHSIZE]; unsigned long hash(char *str) /* djb2a */ @@ -37,10 +47,13 @@ TMT *get_termentbyname(char *name) { return NULL; } -TMT *set_termentbyname(char *name) { +TMT *set_termentbyname(char *name, int fd) { struct terment *ret; int idx; + if (nodenames[fd] == NULL) { + nodenames[fd] = strdup(name); + } idx = hash(name); for (ret = buffers[idx]; ret != NULL; ret = ret->next) if (strcmp(name, ret->name) == 0) @@ -48,12 +61,13 @@ TMT *set_termentbyname(char *name) { ret = (struct terment *)malloc(sizeof(*ret)); ret->next = buffers[idx]; ret->name = strdup(name); + ret->fd = fd; ret->vt = tmt_open(31, 100, NULL, NULL, L"→←↑↓■◆▒°±▒┘┐┌└┼⎺───⎽├┤┴┬│≤≥π≠£•"); buffers[idx] = ret; return ret->vt; } -void dump_vt(TMT* outvt) { +void dump_vt(TMT* outvt, int outfd) { const TMTSCREEN *out = tmt_screen(outvt); const TMTPOINT *curs = tmt_cursor(outvt); int line, idx, maxcol, maxrow; @@ -67,9 +81,10 @@ void dump_vt(TMT* outvt) { tmt_color_t fg = TMT_COLOR_DEFAULT; tmt_color_t bg = TMT_COLOR_DEFAULT; wchar_t sgrline[30]; + char strbuffer[128]; size_t srgidx = 0; char colorcode = 0; - wprintf(L"\033c"); + write(outfd, "\033c", 2); maxcol = 0; maxrow = 0; for (line = out->nline - 1; line >= 0; --line) { @@ -148,60 +163,136 @@ void dump_vt(TMT* outvt) { } if (sgrline[0] != 0) { sgrline[wcslen(sgrline) - 1] = 0; // Trim last ; - wprintf(L"\033[%lsm", sgrline); + + snprintf(strbuffer, sizeof(strbuffer), "\033[%lsm", sgrline); + write(outfd, strbuffer, strlen(strbuffer)); + write(outfd, "\033[]", 3); } - wprintf(L"%lc", out->lines[line]->chars[idx].c); + snprintf(strbuffer, sizeof(strbuffer), "%lc", out->lines[line]->chars[idx].c); + write(outfd, strbuffer, strlen(strbuffer)); } if (line < maxrow) - wprintf(L"\r\n"); + write(outfd, "\r\n", 2); } - fflush(stdout); - wprintf(L"\x1b[%ld;%ldH", curs->r + 1, curs->c + 1); - fflush(stdout); + //fflush(stdout); + snprintf(strbuffer, sizeof(strbuffer), "\x1b[%ld;%ldH", curs->r + 1, curs->c + 1); + write(outfd, strbuffer, strlen(strbuffer)); + //fflush(stdout); +} + +int handle_traffic(int fd) { + int cmd, length; + char currnode[MAXNAMELEN]; + char cmdbuf[MAXDATALEN]; + char *nodename; + TMT *currvt = NULL; + TMT *outvt = NULL; + length = read(fd, &cmd, 4); + if (length <= 0) { + return 0; + } + length = cmd & 536870911; + cmd = cmd >> 29; + if (cmd == SETNODE) { + cmd = read(fd, currnode, length); + currnode[length] = 0; + if (cmd < 0) + return 0; + currvt = set_termentbyname(currnode, fd); + } else if (cmd == WRITE) { + if (currvt == NULL) { + nodename = nodenames[fd]; + currvt = set_termentbyname(nodename, fd); + } + cmd = read(fd, cmdbuf, length); + cmdbuf[length] = 0; + if (cmd < 0) + return 0; + tmt_write(currvt, cmdbuf, length); + } else if (cmd == READBUFF) { + cmd = read(fd, cmdbuf, length); + cmdbuf[length] = 0; + if (cmd < 0) + return 0; + outvt = get_termentbyname(cmdbuf); + if (outvt != NULL) + dump_vt(outvt, fd); + length = write(fd, "\x00", 1); + if (length < 0) + return 0; + } else if (cmd == CLOSECONN) { + return 0; + } + return 1; } int main(int argc, char* argv[]) { - int cmd, length; setlocale(LC_ALL, ""); - char cmdbuf[MAXDATALEN]; - char currnode[MAXNAMELEN]; - TMT *currvt = NULL; - TMT *outvt = NULL; + struct sockaddr_un addr; + int numevts; + int status; + int poller; + int n; + socklen_t len; + int ctlsock, currsock; + socklen_t addrlen; + struct ucred ucr; + + struct epoll_event epvt, evts[MAXEVTS]; stdin = freopen(NULL, "rb", stdin); if (stdin == NULL) { exit(1); } + memset(&addr, 0, sizeof(struct sockaddr_un)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path + 1, argv[1], sizeof(addr.sun_path) - 2); // abstract namespace socket + ctlsock = socket(AF_UNIX, SOCK_STREAM, 0); + status = bind(ctlsock, (const struct sockaddr*)&addr, sizeof(sa_family_t) + strlen(argv[1]) + 1); //sizeof(struct sockaddr_un)); + if (status < 0) { + perror("Unable to open unix socket - "); + exit(1); + } + listen(ctlsock, 128); + poller = epoll_create(1); + memset(&epvt, 0, sizeof(struct epoll_event)); + epvt.events = EPOLLIN; + epvt.data.fd = ctlsock; + if (epoll_ctl(poller, EPOLL_CTL_ADD, ctlsock, &epvt) < 0) { + perror("Unable to poll the socket"); + exit(1); + } + // create a unix domain socket for accepting, each connection is only allowed to either read or write, not both while (1) { - length = fread(&cmd, 4, 1, stdin); - if (length < 0) - continue; - length = cmd & 536870911; - cmd = cmd >> 29; - if (cmd == SETNODE) { - cmd = fread(currnode, 1, length, stdin); - currnode[length] = 0; - if (cmd < 0) - continue; - currvt = set_termentbyname(currnode); - } else if (cmd == WRITE) { - if (currvt == NULL) - currvt = set_termentbyname(""); - cmd = fread(cmdbuf, 1, length, stdin); - cmdbuf[length] = 0; - if (cmd < 0) - continue; - tmt_write(currvt, cmdbuf, length); - } else if (cmd == READBUFF) { - cmd = fread(cmdbuf, 1, length, stdin); - cmdbuf[length] = 0; - if (cmd < 0) - continue; - outvt = get_termentbyname(cmdbuf); - if (outvt != NULL) - dump_vt(outvt); - length = write(1, "\x00", 1); - if (length < 0) - continue; + numevts = epoll_wait(poller, evts, MAXEVTS, -1); + if (numevts < 0) { + perror("Failed wait"); + exit(1); + } + for (n = 0; n < numevts; ++n) { + if (evts[n].data.fd == ctlsock) { + currsock = accept(ctlsock, (struct sockaddr *) &addr, &addrlen); + len = sizeof(ucr); + getsockopt(currsock, SOL_SOCKET, SO_PEERCRED, &ucr, &len); + if (ucr.uid != getuid()) { // block access for other users + close(currsock); + continue; + } + memset(&epvt, 0, sizeof(struct epoll_event)); + epvt.events = EPOLLIN; + epvt.data.fd = currsock; + epoll_ctl(poller, EPOLL_CTL_ADD, currsock, &epvt); + } else { + if (!handle_traffic(evts[n].data.fd)) { + epoll_ctl(poller, EPOLL_CTL_DEL, evts[n].data.fd, NULL); + close(evts[n].data.fd); + if (nodenames[evts[n].data.fd] != NULL) { + free(nodenames[evts[n].data.fd]); + nodenames[evts[n].data.fd] = NULL; + } + } + } } } } + + From fa5b1c671ef54e55f9f04b57a894a06dd2f23123 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Thu, 22 Feb 2024 15:07:12 -0500 Subject: [PATCH 02/14] Remove disused bufferlock We no longer use a lock on buffer communication, eliminate the stale variable. --- confluent_server/confluent/consoleserver.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/confluent_server/confluent/consoleserver.py b/confluent_server/confluent/consoleserver.py index 19509eb5..aa05b9b7 100644 --- a/confluent_server/confluent/consoleserver.py +++ b/confluent_server/confluent/consoleserver.py @@ -49,7 +49,6 @@ _handled_consoles = {} _tracelog = None _bufferdaemon = None -_bufferlock = None try: range = xrange @@ -602,8 +601,6 @@ def _start_tenant_sessions(cfm): def initialize(): global _tracelog global _bufferdaemon - global _bufferlock - _bufferlock = semaphore.Semaphore() _tracelog = log.Logger('trace') _bufferdaemon = subprocess.Popen( ['/opt/confluent/bin/vtbufferd', 'confluent-vtbuffer'], bufsize=0, stdin=subprocess.DEVNULL, From 75db6da621632db72e40d1a208c812f327c0b6f1 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 4 Mar 2024 08:06:01 -0500 Subject: [PATCH 03/14] Opportunisticlly use sshd_config.d when detected --- .../el8/profiles/default/scripts/setupssh.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/confluent_osdeploy/el8/profiles/default/scripts/setupssh.sh b/confluent_osdeploy/el8/profiles/default/scripts/setupssh.sh index f06c4d61..bc74faf5 100644 --- a/confluent_osdeploy/el8/profiles/default/scripts/setupssh.sh +++ b/confluent_osdeploy/el8/profiles/default/scripts/setupssh.sh @@ -1,8 +1,12 @@ #!/bin/sh -grep HostCert /etc/ssh/sshd_config.anaconda >> /mnt/sysimage/etc/ssh/sshd_config -echo HostbasedAuthentication yes >> /mnt/sysimage/etc/ssh/sshd_config -echo HostbasedUsesNameFromPacketOnly yes >> /mnt/sysimage/etc/ssh/sshd_config -echo IgnoreRhosts no >> /mnt/sysimage/etc/ssh/sshd_config +targssh=/mnt/sysimage/etc/ssh/sshd_config +if [ -d /mnt/sysimage/etc/ssh/sshd_config.d/ ]; then + targssh=/mnt/sysimage/etc/ssh/sshd_config.d/90-confluent.conf +fi +grep HostCert /etc/ssh/sshd_config.anaconda >> $targssh +echo HostbasedAuthentication yes >> $targssh +echo HostbasedUsesNameFromPacketOnly yes >> $targssh +echo IgnoreRhosts no >> $targssh sshconf=/mnt/sysimage/etc/ssh/ssh_config if [ -d /mnt/sysimage/etc/ssh/ssh_config.d/ ]; then sshconf=/mnt/sysimage/etc/ssh/ssh_config.d/01-confluent.conf From 2f8dfac9bce1e127d742f7cade64c3a3522369e1 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Wed, 6 Mar 2024 08:45:23 -0500 Subject: [PATCH 04/14] Dump stderr to client if ansible had an utterly disastrous condition --- confluent_server/confluent/runansible.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/confluent_server/confluent/runansible.py b/confluent_server/confluent/runansible.py index cbbecc58..8e5d1a3d 100644 --- a/confluent_server/confluent/runansible.py +++ b/confluent_server/confluent/runansible.py @@ -63,6 +63,9 @@ class PlayRunner(object): else: textout += result['state'] + '\n' textout += '\n' + if self.stderr: + textout += "ERRORS **********************************\n" + textout += self.stderr return textout def dump_json(self): From 5ae3f4c62aa8b9df37f4cff335d089a1a3717363 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Wed, 6 Mar 2024 09:27:53 -0500 Subject: [PATCH 05/14] Properly address runansible error relay --- confluent_server/confluent/runansible.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/confluent_server/confluent/runansible.py b/confluent_server/confluent/runansible.py index 8e5d1a3d..50696742 100644 --- a/confluent_server/confluent/runansible.py +++ b/confluent_server/confluent/runansible.py @@ -32,6 +32,7 @@ anspypath = None running_status = {} class PlayRunner(object): def __init__(self, playfiles, nodes): + self.stderr = '' self.playfiles = playfiles self.nodes = nodes self.worker = None @@ -96,7 +97,8 @@ class PlayRunner(object): [mypath, __file__, targnodes, playfilename], stdin=devnull, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, self.stderr = worker.communicate() + stdout, stder = worker.communicate() + self.stderr += stder.decode('utf8') current = memoryview(stdout) while len(current): sz = struct.unpack('=q', current[:8])[0] From 3ffeef5cf306d4c6040b898a0e1b7ad34a4e8a22 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Wed, 6 Mar 2024 16:27:09 -0500 Subject: [PATCH 06/14] Fix stray blank line at end of nodelist Wrong indentation level for nodelist resulting in spurious line. --- confluent_client/bin/nodelist | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/confluent_client/bin/nodelist b/confluent_client/bin/nodelist index 462ed922..c1b9c436 100755 --- a/confluent_client/bin/nodelist +++ b/confluent_client/bin/nodelist @@ -68,7 +68,7 @@ def main(): else: elem=(res['item']['href'].replace('/', '')) list.append(elem) - print(options.delim.join(list)) + print(options.delim.join(list)) sys.exit(exitcode) From cdefb400f9b1eaf94142d350fdc8f7c1006fac41 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 11 Mar 2024 13:32:45 -0400 Subject: [PATCH 07/14] Expose fingerprinting and better error handling to osdeploy This allows custom name and pre-import checking. --- confluent_client/confluent_env.sh | 4 ++-- confluent_server/bin/osdeploy | 35 +++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/confluent_client/confluent_env.sh b/confluent_client/confluent_env.sh index 81a70198..925a873d 100644 --- a/confluent_client/confluent_env.sh +++ b/confluent_client/confluent_env.sh @@ -153,11 +153,11 @@ _confluent_osimage_completion() { _confluent_get_args if [ $NUMARGS == 2 ]; then - COMPREPLY=($(compgen -W "initialize import updateboot rebase" -- ${COMP_WORDS[COMP_CWORD]})) + COMPREPLY=($(compgen -W "initialize import importcheck updateboot rebase" -- ${COMP_WORDS[COMP_CWORD]})) return elif [ ${CMPARGS[1]} == 'initialize' ]; then COMPREPLY=($(compgen -W "-h -u -s -t -i" -- ${COMP_WORDS[COMP_CWORD]})) - elif [ ${CMPARGS[1]} == 'import' ]; then + elif [ ${CMPARGS[1]} == 'import' ] || [ ${CMPARGS[1]} == 'importcheck' ]; then compopt -o default COMPREPLY=() return diff --git a/confluent_server/bin/osdeploy b/confluent_server/bin/osdeploy index fff220be..47ebc4a8 100644 --- a/confluent_server/bin/osdeploy +++ b/confluent_server/bin/osdeploy @@ -1,4 +1,4 @@ -#!/usr/bin/python2 +#!/usr/bin/python3 __author__ = 'jjohnson2,bfinley' @@ -49,8 +49,11 @@ def main(args): wiz.add_argument('-p', help='Copy in TFTP contents required for PXE support', action='store_true') wiz.add_argument('-i', help='Interactively prompt for behaviors', action='store_true') wiz.add_argument('-l', help='Set up local management node to allow login from managed nodes', action='store_true') + osip = sp.add_parser('importcheck', help='Check import of an OS image from an ISO image') + osip.add_argument('imagefile', help='File to use for source of importing') osip = sp.add_parser('import', help='Import an OS image from an ISO image') osip.add_argument('imagefile', help='File to use for source of importing') + osip.add_argument('-n', help='Specific a custom distribution name') upb = sp.add_parser( 'updateboot', help='Push profile.yaml of the named profile data into boot assets as appropriate') @@ -63,7 +66,9 @@ def main(args): if cmdset.command == 'list': return oslist() if cmdset.command == 'import': - return osimport(cmdset.imagefile) + return osimport(cmdset.imagefile, custname=cmdset.n) + if cmdset.command == 'importcheck': + return osimport(cmdset.imagefile, checkonly=True) if cmdset.command == 'initialize': return initialize(cmdset) if cmdset.command == 'updateboot': @@ -496,7 +501,7 @@ def oslist(): print("") -def osimport(imagefile): +def osimport(imagefile, checkonly=False, custname=None): c = client.Command() imagefile = os.path.abspath(imagefile) if c.unixdomain: @@ -507,11 +512,33 @@ def osimport(imagefile): pass importing = False shortname = None - for rsp in c.create('/deployment/importing/', {'filename': imagefile}): + apipath = '/deployment/importing/' + if checkonly: + apipath = '/deployment/fingerprint/' + apiargs = {'filename': imagefile} + if custname: + apiargs['custname'] = custname + for rsp in c.create(apipath, apiargs): if 'target' in rsp: importing = True shortname = rsp['name'] print('Importing from {0} to {1}'.format(imagefile, rsp['target'])) + elif 'targetpath' in rsp: + tpath = rsp.get('targetpath', None) + tname = rsp.get('name', None) + oscat = rsp.get('oscategory', None) + if tpath: + print('Detected target directory: ' + tpath) + if tname: + print('Detected distribution name: ' + tname) + if oscat: + print('Detected OS category: ' + oscat) + for err in rsp.get('errors', []): + sys.stderr.write('Error: ' + err + '\n') + + elif 'error' in rsp: + sys.stderr.write(rsp['error'] + '\n') + sys.exit(rsp.get('errorcode', 1)) else: print(repr(rsp)) try: From 49e614eb32fe1cf3f2887932cf5b5d4b71092220 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 11 Mar 2024 17:10:33 -0400 Subject: [PATCH 08/14] Have image2disk delay exit on error Debugging cloning is difficult when system immediately reboots on error. --- .../el8-diskless/profiles/default/scripts/image2disk.py | 8 +++++++- .../el9-diskless/profiles/default/scripts/image2disk.py | 8 +++++++- .../profiles/default/scripts/image2disk.py | 8 +++++++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/confluent_osdeploy/el8-diskless/profiles/default/scripts/image2disk.py b/confluent_osdeploy/el8-diskless/profiles/default/scripts/image2disk.py index aaaca9d4..655aaedc 100644 --- a/confluent_osdeploy/el8-diskless/profiles/default/scripts/image2disk.py +++ b/confluent_osdeploy/el8-diskless/profiles/default/scripts/image2disk.py @@ -10,6 +10,7 @@ import stat import struct import sys import subprocess +import traceback bootuuid = None @@ -426,4 +427,9 @@ def install_to_disk(imgpath): if __name__ == '__main__': - install_to_disk(os.environ['mountsrc']) + try: + install_to_disk(os.environ['mountsrc']) + except Exception: + traceback.print_exc() + time.sleep(86400) + raise diff --git a/confluent_osdeploy/el9-diskless/profiles/default/scripts/image2disk.py b/confluent_osdeploy/el9-diskless/profiles/default/scripts/image2disk.py index 7b312a93..48a15767 100644 --- a/confluent_osdeploy/el9-diskless/profiles/default/scripts/image2disk.py +++ b/confluent_osdeploy/el9-diskless/profiles/default/scripts/image2disk.py @@ -10,6 +10,7 @@ import stat import struct import sys import subprocess +import traceback bootuuid = None @@ -426,4 +427,9 @@ def install_to_disk(imgpath): if __name__ == '__main__': - install_to_disk(os.environ['mountsrc']) + try: + install_to_disk(os.environ['mountsrc']) + except Exception: + traceback.print_exc() + time.sleep(86400) + raise diff --git a/confluent_osdeploy/ubuntu20.04-diskless/profiles/default/scripts/image2disk.py b/confluent_osdeploy/ubuntu20.04-diskless/profiles/default/scripts/image2disk.py index 1d19ebad..91afc5cb 100644 --- a/confluent_osdeploy/ubuntu20.04-diskless/profiles/default/scripts/image2disk.py +++ b/confluent_osdeploy/ubuntu20.04-diskless/profiles/default/scripts/image2disk.py @@ -10,6 +10,7 @@ import stat import struct import sys import subprocess +import traceback bootuuid = None @@ -424,5 +425,10 @@ def install_to_disk(imgpath): if __name__ == '__main__': - install_to_disk(os.environ['mountsrc']) + try: + install_to_disk(os.environ['mountsrc']) + except Exception: + traceback.print_exc() + time.sleep(86400) + raise From 0d720baf2539a188d5b80cf4721bdcf5bcab66e8 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Tue, 12 Mar 2024 09:36:40 -0400 Subject: [PATCH 09/14] Fix lldp when peername is null Some neighbors result in a null name, handle that. --- confluent_server/confluent/networking/lldp.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/confluent_server/confluent/networking/lldp.py b/confluent_server/confluent/networking/lldp.py index e1fd8d4e..e181d46f 100644 --- a/confluent_server/confluent/networking/lldp.py +++ b/confluent_server/confluent/networking/lldp.py @@ -381,9 +381,10 @@ def list_info(parms, requestedparameter): break else: candidate = info[requestedparameter] - candidate = candidate.strip() - if candidate != '': - results.add(_api_sanitize_string(candidate)) + if candidate: + candidate = candidate.strip() + if candidate != '': + results.add(_api_sanitize_string(candidate)) return [msg.ChildCollection(x + suffix) for x in util.natural_sort(results)] def _handle_neighbor_query(pathcomponents, configmanager): From 17af9c74b81927601e84845965de421db9deb022 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Tue, 12 Mar 2024 15:32:44 -0400 Subject: [PATCH 10/14] Fix nodeapply redoing a single node multiple times --- confluent_client/bin/nodeapply | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/confluent_client/bin/nodeapply b/confluent_client/bin/nodeapply index e39447bc..2e798742 100755 --- a/confluent_client/bin/nodeapply +++ b/confluent_client/bin/nodeapply @@ -102,9 +102,9 @@ def run(): cmdv = ['ssh', sshnode] + cmdvbase + cmdstorun[0] if currprocs < concurrentprocs: currprocs += 1 - run_cmdv(node, cmdv, all, pipedesc) + run_cmdv(sshnode, cmdv, all, pipedesc) else: - pendingexecs.append((node, cmdv)) + pendingexecs.append((sshnode, cmdv)) if not all or exitcode: sys.exit(exitcode) rdy, _, _ = select.select(all, [], [], 10) From 58d9bc1816101ac814beaa32d4da237e35aea9bc Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Thu, 14 Mar 2024 10:50:01 -0400 Subject: [PATCH 11/14] Updates to confluent_selfcheck Reap ssh-agent to avoid stale agents lying around. Remove nuisance warnings about virbr0 when present. Do a full runthrough as the confluent user to ssh to a node when user requests with '-a', marking known_hosts and automation key issues. --- confluent_server/bin/confluent_selfcheck | 34 ++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/confluent_server/bin/confluent_selfcheck b/confluent_server/bin/confluent_selfcheck index cc1409cf..1539434f 100755 --- a/confluent_server/bin/confluent_selfcheck +++ b/confluent_server/bin/confluent_selfcheck @@ -22,6 +22,8 @@ import shutil import eventlet.green.socket as socket import eventlet import greenlet +import pwd +import signal def fprint(txt): sys.stdout.write(txt) @@ -109,6 +111,8 @@ def nics_missing_ipv6(): iname, state = comps[:2] if iname == b'lo': continue + if iname == b'virbr0': + continue addrs = comps[2:] hasv6 = False hasv4 = False @@ -157,6 +161,7 @@ def lookup_node(node): if __name__ == '__main__': ap = argparse.ArgumentParser(description='Run configuration checks for a system running confluent service') ap.add_argument('-n', '--node', help='A node name to run node specific checks against') + ap.add_argument('-a', '--automation', help='Do checks against a deployed node for automation and syncfiles function', action='store_true') args, extra = ap.parse_known_args(sys.argv) if len(extra) > 1: ap.print_help() @@ -217,6 +222,7 @@ if __name__ == '__main__': print('OK') except subprocess.CalledProcessError: emprint('Failed to load confluent automation key, syncfiles and profile ansible plays will not work (Example resolution: osdeploy initialize -a)') + os.kill(int(sshutil.agent_pid), signal.SIGTERM) fprint('Checking for blocked insecure boot: ') if insecure_boot_attempts(): emprint('Some nodes are attempting network boot using PXE or HTTP boot, but the node is not configured to allow this (Example resolution: nodegroupattrib everything deployment.useinsecureprotocols=firmware)') @@ -311,6 +317,34 @@ if __name__ == '__main__': emprint('Name resolution failed for node, it is normally a good idea for the node name to resolve to an IP') if result: print("OK") + if args.automation: + print(f'Checking confluent automation access to {args.node}...') + child = os.fork() + if child > 0: + pid, extcode = os.waitpid(child, 0) + else: + sshutil.ready_keys = {} + sshutil.agent_pid = None + cuser = pwd.getpwnam('confluent') + os.setgid(cuser.pw_gid) + os.setuid(cuser.pw_uid) + sshutil.prep_ssh_key('/etc/confluent/ssh/automation') + srun = subprocess.run( + ['ssh', '-Tn', '-o', 'BatchMode=yes', '-l', 'root', + '-o', 'StrictHostKeyChecking=yes', args.node, 'true'], + stdin=subprocess.DEVNULL, stderr=subprocess.PIPE) + os.kill(int(sshutil.agent_pid), signal.SIGTERM) + if srun.returncode == 0: + print(f'Confluent automation access to {args.node} seems OK') + else: + if b'Host key verification failed' in srun.stderr: + emprint('Confluent ssh unable to verify host key, check /etc/ssh/ssh_known_hosts. (Example resolution: osdeploy initialize -k)') + elif b'ermission denied' in srun.stderr: + emprint('Confluent user unable to ssh in, check /root/.ssh/authorized_keys on the target system versus /etc/confluent/ssh/automation.pub (Example resolution: osdeploy initialize -a)') + else: + emprint('Unknown error attempting confluent automation ssh:') + sys.stderr.buffer.write(srun.stderr) + os.kill(int(sshutil.agent_pid), signal.SIGTERM) else: print("Skipping node checks, no node specified (Example: confluent_selfcheck -n n1)") # possible checks: From 876b59c1f0a2998ad58888f33c4fb099da5f7319 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Thu, 14 Mar 2024 10:52:52 -0400 Subject: [PATCH 12/14] Remove redundant kill on the agent pid Extraneous kill on the agent pid is removed. --- confluent_server/bin/confluent_selfcheck | 1 - 1 file changed, 1 deletion(-) diff --git a/confluent_server/bin/confluent_selfcheck b/confluent_server/bin/confluent_selfcheck index 1539434f..f558cf46 100755 --- a/confluent_server/bin/confluent_selfcheck +++ b/confluent_server/bin/confluent_selfcheck @@ -344,7 +344,6 @@ if __name__ == '__main__': else: emprint('Unknown error attempting confluent automation ssh:') sys.stderr.buffer.write(srun.stderr) - os.kill(int(sshutil.agent_pid), signal.SIGTERM) else: print("Skipping node checks, no node specified (Example: confluent_selfcheck -n n1)") # possible checks: From 1d4505ff3ca1916e1a4eeed5a7b3d886477c9c25 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Thu, 14 Mar 2024 11:20:36 -0400 Subject: [PATCH 13/14] SSH test by IP, to reflect actual usage and catch issues One issue is modified ssh_known_hosts wildcard customization failing to cover IP address. --- confluent_server/bin/confluent_selfcheck | 33 ++++++++++++++---------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/confluent_server/bin/confluent_selfcheck b/confluent_server/bin/confluent_selfcheck index f558cf46..b9651d17 100755 --- a/confluent_server/bin/confluent_selfcheck +++ b/confluent_server/bin/confluent_selfcheck @@ -280,13 +280,17 @@ if __name__ == '__main__': cfg = configmanager.ConfigManager(None) bootablev4nics = [] bootablev6nics = [] + targsships = [] for nic in glob.glob("/sys/class/net/*/ifindex"): idx = int(open(nic, "r").read()) nicname = nic.split('/')[-2] ncfg = netutil.get_nic_config(cfg, args.node, ifidx=idx) + if ncfg['ipv4_address']: + targsships.append(ncfg['ipv4_address']) if ncfg['ipv4_address'] or ncfg['ipv4_method'] == 'dhcp': bootablev4nics.append(nicname) if ncfg['ipv6_address']: + targsships.append(ncfg['ipv6_address']) bootablev6nics.append(nicname) if bootablev4nics: print("{} appears to have network configuration suitable for IPv4 deployment via: {}".format(args.node, ",".join(bootablev4nics))) @@ -329,21 +333,22 @@ if __name__ == '__main__': os.setgid(cuser.pw_gid) os.setuid(cuser.pw_uid) sshutil.prep_ssh_key('/etc/confluent/ssh/automation') - srun = subprocess.run( - ['ssh', '-Tn', '-o', 'BatchMode=yes', '-l', 'root', - '-o', 'StrictHostKeyChecking=yes', args.node, 'true'], - stdin=subprocess.DEVNULL, stderr=subprocess.PIPE) - os.kill(int(sshutil.agent_pid), signal.SIGTERM) - if srun.returncode == 0: - print(f'Confluent automation access to {args.node} seems OK') - else: - if b'Host key verification failed' in srun.stderr: - emprint('Confluent ssh unable to verify host key, check /etc/ssh/ssh_known_hosts. (Example resolution: osdeploy initialize -k)') - elif b'ermission denied' in srun.stderr: - emprint('Confluent user unable to ssh in, check /root/.ssh/authorized_keys on the target system versus /etc/confluent/ssh/automation.pub (Example resolution: osdeploy initialize -a)') + for targ in targsships: + srun = subprocess.run( + ['ssh', '-Tn', '-o', 'BatchMode=yes', '-l', 'root', + '-o', 'StrictHostKeyChecking=yes', targ, 'true'], + stdin=subprocess.DEVNULL, stderr=subprocess.PIPE) + if srun.returncode == 0: + print(f'Confluent automation access to {targ} seems OK') else: - emprint('Unknown error attempting confluent automation ssh:') - sys.stderr.buffer.write(srun.stderr) + if b'Host key verification failed' in srun.stderr: + emprint(f'Confluent ssh unable to verify host key for {targ}, check /etc/ssh/ssh_known_hosts. (Example resolution: osdeploy initialize -k)') + elif b'ermission denied' in srun.stderr: + emprint(f'Confluent user unable to ssh in to {targ}, check /root/.ssh/authorized_keys on the target system versus /etc/confluent/ssh/automation.pub (Example resolution: osdeploy initialize -a)') + else: + emprint('Unknown error attempting confluent automation ssh:') + sys.stderr.buffer.write(srun.stderr) + os.kill(int(sshutil.agent_pid), signal.SIGTERM) else: print("Skipping node checks, no node specified (Example: confluent_selfcheck -n n1)") # possible checks: From 789376029dc56c9a6bceb18c1cf0476f37bf1df5 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Fri, 15 Mar 2024 09:57:23 -0400 Subject: [PATCH 14/14] Numerous fixes to the EL9 cloning Fix various callbacks when using IPv6 based deployment. Do not attempt to restore erroneously cloned zram partitions. Convert LVM names to new LVM names consistent with source naming scheme. Push new kernel command line into /boot/loader and /etc/kernel/cmdline. --- .../profiles/default/scripts/firstboot.sh | 2 +- .../profiles/default/scripts/image2disk.py | 99 ++++++++++++++++--- .../profiles/default/scripts/post.sh | 6 +- 3 files changed, 91 insertions(+), 16 deletions(-) diff --git a/confluent_osdeploy/el9-diskless/profiles/default/scripts/firstboot.sh b/confluent_osdeploy/el9-diskless/profiles/default/scripts/firstboot.sh index ed11d9e7..fabb9385 100644 --- a/confluent_osdeploy/el9-diskless/profiles/default/scripts/firstboot.sh +++ b/confluent_osdeploy/el9-diskless/profiles/default/scripts/firstboot.sh @@ -41,7 +41,7 @@ if [ ! -f /etc/confluent/firstboot.ran ]; then run_remote_config firstboot.d fi -curl -X POST -d 'status: complete' -H "CONFLUENT_NODENAME: $nodename" -H "CONFLUENT_APIKEY: $confluent_apikey" https://$confluent_mgr/confluent-api/self/updatestatus +curl -X POST -d 'status: complete' -H "CONFLUENT_NODENAME: $nodename" -H "CONFLUENT_APIKEY: $confluent_apikey" https://$confluent_websrv/confluent-api/self/updatestatus systemctl disable firstboot rm /etc/systemd/system/firstboot.service rm /etc/confluent/firstboot.ran diff --git a/confluent_osdeploy/el9-diskless/profiles/default/scripts/image2disk.py b/confluent_osdeploy/el9-diskless/profiles/default/scripts/image2disk.py index 48a15767..83cffc6b 100644 --- a/confluent_osdeploy/el9-diskless/profiles/default/scripts/image2disk.py +++ b/confluent_osdeploy/el9-diskless/profiles/default/scripts/image2disk.py @@ -13,6 +13,13 @@ import subprocess import traceback bootuuid = None +vgname = 'localstorage' +oldvgname = None + +def convert_lv(oldlvname): + if oldvgname is None: + return None + return oldlvname.replace(oldvgname, vgname) def get_partname(devname, idx): if devname[-1] in '0123456789': @@ -54,6 +61,8 @@ def get_image_metadata(imgpath): header = img.read(16) if header == b'\x63\x7b\x9d\x26\xb7\xfd\x48\x30\x89\xf9\x11\xcf\x18\xfd\xff\xa1': for md in get_multipart_image_meta(img): + if md.get('device', '').startswith('/dev/zram'): + continue yield md else: raise Exception('Installation from single part image not supported') @@ -87,14 +96,14 @@ def fixup(rootdir, vols): if tab.startswith('#ORIGFSTAB#'): if entry[1] in devbymount: targetdev = devbymount[entry[1]] - if targetdev.startswith('/dev/localstorage/'): + if targetdev.startswith('/dev/{}/'.format(vgname)): entry[0] = targetdev else: uuid = subprocess.check_output(['blkid', '-s', 'UUID', '-o', 'value', targetdev]).decode('utf8') uuid = uuid.strip() entry[0] = 'UUID={}'.format(uuid) elif entry[2] == 'swap': - entry[0] = '/dev/mapper/localstorage-swap' + entry[0] = '/dev/mapper/{}-swap'.format(vgname) entry[0] = entry[0].ljust(42) entry[1] = entry[1].ljust(16) entry[3] = entry[3].ljust(28) @@ -142,6 +151,46 @@ def fixup(rootdir, vols): grubsyscfg = os.path.join(rootdir, 'etc/sysconfig/grub') if not os.path.exists(grubsyscfg): grubsyscfg = os.path.join(rootdir, 'etc/default/grub') + kcmdline = os.path.join(rootdir, 'etc/kernel/cmdline') + if os.path.exists(kcmdline): + with open(kcmdline) as kcmdlinein: + kcmdlinecontent = kcmdlinein.read() + newkcmdlineent = [] + for ent in kcmdlinecontent.split(): + if ent.startswith('resume='): + newkcmdlineent.append('resume={}'.format(newswapdev)) + elif ent.startswith('root='): + newkcmdlineent.append('root={}'.format(newrootdev)) + elif ent.startswith('rd.lvm.lv='): + ent = convert_lv(ent) + if ent: + newkcmdlineent.append(ent) + else: + newkcmdlineent.append(ent) + with open(kcmdline, 'w') as kcmdlineout: + kcmdlineout.write(' '.join(newkcmdlineent) + '\n') + for loadent in glob.glob(os.path.join(rootdir, 'boot/loader/entries/*.conf')): + with open(loadent) as loadentin: + currentry = loadentin.read().split('\n') + with open(loadent, 'w') as loadentout: + for cfgline in currentry: + cfgparts = cfgline.split() + if not cfgparts or cfgparts[0] != 'options': + loadentout.write(cfgline + '\n') + continue + newcfgparts = [cfgparts[0]] + for cfgpart in cfgparts[1:]: + if cfgpart.startswith('root='): + newcfgparts.append('root={}'.format(newrootdev)) + elif cfgpart.startswith('resume='): + newcfgparts.append('resume={}'.format(newswapdev)) + elif cfgpart.startswith('rd.lvm.lv='): + cfgpart = convert_lv(cfgpart) + if cfgpart: + newcfgparts.append(cfgpart) + else: + newcfgparts.append(cfgpart) + loadentout.write(' '.join(newcfgparts) + '\n') with open(grubsyscfg) as defgrubin: defgrub = defgrubin.read().split('\n') with open(grubsyscfg, 'w') as defgrubout: @@ -149,9 +198,16 @@ def fixup(rootdir, vols): gline = gline.split() newline = [] for ent in gline: - if ent.startswith('resume=') or ent.startswith('rd.lvm.lv'): - continue - newline.append(ent) + if ent.startswith('resume='): + newline.append('resume={}'.format(newswapdev)) + elif ent.startswith('root='): + newline.append('root={}'.format(newrootdev)) + elif ent.startswith('rd.lvm.lv='): + ent = convert_lv(ent) + if ent: + newline.append(ent) + else: + newline.append(ent) defgrubout.write(' '.join(newline) + '\n') grubcfg = subprocess.check_output(['find', os.path.join(rootdir, 'boot'), '-name', 'grub.cfg']).decode('utf8').strip().replace(rootdir, '/').replace('//', '/') grubcfg = grubcfg.split('\n') @@ -228,8 +284,14 @@ def had_swap(): return True return False +newrootdev = None +newswapdev = None def install_to_disk(imgpath): global bootuuid + global newrootdev + global newswapdev + global vgname + global oldvgname lvmvols = {} deftotsize = 0 mintotsize = 0 @@ -261,6 +323,12 @@ def install_to_disk(imgpath): biggestfs = fs biggestsize = fs['initsize'] if fs['device'].startswith('/dev/mapper'): + oldvgname = fs['device'].rsplit('/', 1)[-1] + if '_' in oldvgname and '-' in oldvgname.split('_')[-1]: + oldvgname = oldvgname.rsplit('-', 1)[0] + osname = oldvgname.split('_')[0] + nodename = socket.gethostname().split('.')[0] + vgname = '{}_{}'.format(osname, nodename) lvmvols[fs['device'].replace('/dev/mapper/', '')] = fs deflvmsize += fs['initsize'] minlvmsize += fs['minsize'] @@ -305,6 +373,8 @@ def install_to_disk(imgpath): end = sectors parted.run('mkpart primary {}s {}s'.format(curroffset, end)) vol['targetdisk'] = get_partname(instdisk, volidx) + if vol['mount'] == '/': + newrootdev = vol['targetdisk'] curroffset += size + 1 if not lvmvols: if swapsize: @@ -314,13 +384,14 @@ def install_to_disk(imgpath): if end > sectors: end = sectors parted.run('mkpart swap {}s {}s'.format(curroffset, end)) - subprocess.check_call(['mkswap', get_partname(instdisk, volidx + 1)]) + newswapdev = get_partname(instdisk, volidx + 1) + subprocess.check_call(['mkswap', newswapdev]) else: parted.run('mkpart lvm {}s 100%'.format(curroffset)) lvmpart = get_partname(instdisk, volidx + 1) subprocess.check_call(['pvcreate', '-ff', '-y', lvmpart]) - subprocess.check_call(['vgcreate', 'localstorage', lvmpart]) - vginfo = subprocess.check_output(['vgdisplay', 'localstorage', '--units', 'b']).decode('utf8') + subprocess.check_call(['vgcreate', vgname, lvmpart]) + vginfo = subprocess.check_output(['vgdisplay', vgname, '--units', 'b']).decode('utf8') vginfo = vginfo.split('\n') pesize = 0 pes = 0 @@ -347,13 +418,17 @@ def install_to_disk(imgpath): extents += 1 if vol['mount'] == '/': lvname = 'root' + else: lvname = vol['mount'].replace('/', '_') - subprocess.check_call(['lvcreate', '-l', '{}'.format(extents), '-y', '-n', lvname, 'localstorage']) - vol['targetdisk'] = '/dev/localstorage/{}'.format(lvname) + subprocess.check_call(['lvcreate', '-l', '{}'.format(extents), '-y', '-n', lvname, vgname]) + vol['targetdisk'] = '/dev/{}/{}'.format(vgname, lvname) + if vol['mount'] == '/': + newrootdev = vol['targetdisk'] if swapsize: - subprocess.check_call(['lvcreate', '-y', '-l', '{}'.format(swapsize // pesize), '-n', 'swap', 'localstorage']) - subprocess.check_call(['mkswap', '/dev/localstorage/swap']) + subprocess.check_call(['lvcreate', '-y', '-l', '{}'.format(swapsize // pesize), '-n', 'swap', vgname]) + subprocess.check_call(['mkswap', '/dev/{}/swap'.format(vgname)]) + newswapdev = '/dev/{}/swap'.format(vgname) os.makedirs('/run/imginst/targ') for vol in allvols: with open(vol['targetdisk'], 'wb') as partition: diff --git a/confluent_osdeploy/el9-diskless/profiles/default/scripts/post.sh b/confluent_osdeploy/el9-diskless/profiles/default/scripts/post.sh index 3b20a946..7a7ac01e 100644 --- a/confluent_osdeploy/el9-diskless/profiles/default/scripts/post.sh +++ b/confluent_osdeploy/el9-diskless/profiles/default/scripts/post.sh @@ -23,9 +23,9 @@ exec 2>> /var/log/confluent/confluent-post.log chmod 600 /var/log/confluent/confluent-post.log tail -f /var/log/confluent/confluent-post.log > /dev/console & logshowpid=$! -curl -f https://$confluent_mgr/confluent-public/os/$confluent_profile/scripts/firstboot.service > /etc/systemd/system/firstboot.service +curl -f https://$confluent_websrv/confluent-public/os/$confluent_profile/scripts/firstboot.service > /etc/systemd/system/firstboot.service mkdir -p /opt/confluent/bin -curl -f https://$confluent_mgr/confluent-public/os/$confluent_profile/scripts/firstboot.sh > /opt/confluent/bin/firstboot.sh +curl -f https://$confluent_websrv/confluent-public/os/$confluent_profile/scripts/firstboot.sh > /opt/confluent/bin/firstboot.sh chmod +x /opt/confluent/bin/firstboot.sh systemctl enable firstboot selinuxpolicy=$(grep ^SELINUXTYPE /etc/selinux/config |awk -F= '{print $2}') @@ -40,7 +40,7 @@ run_remote_parts post.d # Induce execution of remote configuration, e.g. ansible plays in ansible/post.d/ run_remote_config post.d -curl -sf -X POST -d 'status: staged' -H "CONFLUENT_NODENAME: $nodename" -H "CONFLUENT_APIKEY: $confluent_apikey" https://$confluent_mgr/confluent-api/self/updatestatus +curl -sf -X POST -d 'status: staged' -H "CONFLUENT_NODENAME: $nodename" -H "CONFLUENT_APIKEY: $confluent_apikey" https://$confluent_websrv/confluent-api/self/updatestatus kill $logshowpid