Published a year ago
Published a year ago
s3tupw1zard
Updated a year ago
0
Hello,
I adapted the compose.yaml to run it in Docker Swarm. As soon as I deploy the stack, I can see that my luigi and mgt services can't connect to my postgres service using either the hostname postgres or tasks.postgres. Also, luigi can't connect to the detect service.
I put the bind mounts on Ceph, managed through Proxmox, so that every host always has the same data.
This is my compose.yaml:
# SafeLine stack for Docker Swarm, as posted in the question.
# All ${...} variables come from the .env file exported before
# `docker stack deploy` is run.
networks:
  safeline_net:
    driver: overlay
    attachable: true
    ipam:
      config:
        - subnet: ${SUBNET_PREFIX}.0/24
  proxy_net:
    driver: overlay
    attachable: true

services:
  postgres:
    image: ${IMAGE_PREFIX}/safeline-postgres${ARCH_SUFFIX}:15.2
    hostname: postgres
    volumes:
      - ${SAFELINE_DIR}/resources/postgres/data:/var/lib/postgresql/data
      - /etc/localtime:/etc/localtime:ro
    environment:
      - POSTGRES_USER=safeline-ce
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:?postgres password required}
    networks:
      safeline_net:
        aliases:
          - safeline-postgres
    command: ["postgres", "-c", "max_connections=600"]
    healthcheck:
      test: pg_isready -U safeline-ce -d safeline-ce
      interval: 10s
      timeout: 5s
      retries: 10
    deploy:
      # Single replica; the author reports socket-file errors when more
      # than one instance ran against the shared data directory.
      mode: replicated
      replicas: 1
      restart_policy:
        condition: on-failure
      update_config:
        parallelism: 1
        failure_action: rollback
      rollback_config:
        parallelism: 1

  mgt:
    image: ${IMAGE_PREFIX}/safeline-mgt-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG:?image tag required}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ${SAFELINE_DIR}/resources/mgt:/app/data
      - ${SAFELINE_DIR}/logs/nginx:/app/log/nginx:z
      - ${SAFELINE_DIR}/resources/sock:/app/sock
      - /var/run:/app/run
    ports:
      - "${MGT_PORT:-9443}:1443"
    environment:
      # Host kept as originally posted (`tasks.postgres`); it is the name
      # that appears in the lookup errors in the logs further down.
      - MGT_PG=postgres://safeline-ce:${POSTGRES_PASSWORD}@tasks.postgres/safeline-ce?sslmode=disable
    networks:
      safeline_net:
        aliases:
          - safeline-mgt

  safeline-detector:
    image: ${IMAGE_PREFIX}/safeline-detector-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      # Single replica, for the same socket-file reason as postgres.
      mode: replicated
      replicas: 1
      restart_policy:
        condition: on-failure
      update_config:
        parallelism: 1
        failure_action: rollback
      rollback_config:
        parallelism: 1
    healthcheck:
      # Healthy once the detector's Unix socket exists.
      test: ["CMD", "sh", "-c", "[ -S /resources/detector/snserver.sock ]"]
      interval: 30s
      timeout: 10s
      retries: 10
      start_period: 10s
    volumes:
      - ${SAFELINE_DIR}/resources/detector:/resources/detector
      - ${SAFELINE_DIR}/logs/detector:/logs/detector
      - /etc/localtime:/etc/localtime:ro
    environment:
      - LOG_DIR=/logs/detector
    networks:
      safeline_net:
        aliases:
          - safeline-detector

  tengine:
    image: ${IMAGE_PREFIX}/safeline-tengine-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /etc/resolv.conf:/etc/resolv.conf:ro
      - ${SAFELINE_DIR}/resources/nginx:/etc/nginx
      - ${SAFELINE_DIR}/resources/detector:/resources/detector
      - ${SAFELINE_DIR}/resources/chaos:/resources/chaos
      - ${SAFELINE_DIR}/logs/nginx:/var/log/nginx:z
      - ${SAFELINE_DIR}/resources/cache:/usr/local/nginx/cache
      - ${SAFELINE_DIR}/resources/sock:/app/sock
    environment:
      - TCD_MGT_API=https://mgt:1443/api/open/publish/server
      # `safeline-detector` is the network alias of the detector service.
      - TCD_SNSERVER=safeline-detector:8000
    ulimits:
      nofile: 131072
    networks:
      safeline_net:
        aliases:
          - safeline-tengine
      proxy_net: {}

  luigi:
    image: ${IMAGE_PREFIX}/safeline-luigi-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ${SAFELINE_DIR}/resources/luigi:/app/data
    environment:
      # Host kept as originally posted (`tasks.postgres`); see the luigi
      # log excerpt further down.
      - LUIGI_PG=postgres://safeline-ce:${POSTGRES_PASSWORD}@tasks.postgres/safeline-ce?sslmode=disable
    networks:
      safeline_net:
        aliases:
          - safeline-luigi

  fvm:
    image: ${IMAGE_PREFIX}/safeline-fvm-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    volumes:
      - /etc/localtime:/etc/localtime:ro
    networks:
      safeline_net:
        aliases:
          - safeline-fvm

  chaos:
    image: ${IMAGE_PREFIX}/safeline-chaos-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    volumes:
      - ${SAFELINE_DIR}/resources/sock:/app/sock
      - ${SAFELINE_DIR}/resources/chaos:/app/chaos
    networks:
      safeline_net:
        aliases:
          - safeline-chaos
The .env file is almost the same as usual except that the variable SAFELINE_DIR is set to a ceph folder managed through proxmox so that every host always has the same data.
I run only one replica each of the postgres and detector services, because I got errors about sock files that couldn't be opened since another instance already had them open.
Then I deploy the stack using export $(cat .env) > /dev/null 2>&1; docker stack deploy -c compose.yaml safeline -d
Here are snippets of my logs from luigi and mgt services:
Luigi:
safeline_luigi.0.3n9tdzrpzzex@pve02 | [error] failed to initialize database, got error failed to connect to `host=tasks.postgres user=safeline-ce database=safeline-ce`: hostname resolving error (lookup tasks.postgres on 127.0.0.11:53: no such host)
safeline_luigi.0.3n9tdzrpzzex@pve02 | panic: init db failed: failed to connect to `host=tasks.postgres user=safeline-ce database=safeline-ce`: hostname resolving error (lookup tasks.postgres on 127.0.0.11:53: no such host)
safeline_luigi.0.3n9tdzrpzzex@pve02 |
safeline_luigi.0.3n9tdzrpzzex@pve02 | goroutine 1 [running]:
safeline_luigi.0.3n9tdzrpzzex@pve02 | luigi/pkg/pg.Init({0xc00003c009?, 0x8?})
safeline_luigi.0.3n9tdzrpzzex@pve02 | /work/pkg/pg/db.go:58 +0x89
safeline_luigi.0.3n9tdzrpzzex@pve02 | luigi/pkg/pg.init.0()
safeline_luigi.0.3n9tdzrpzzex@pve02 | /work/pkg/pg/db.go:77 +0xc5
safeline_luigi.0.mevmm7r7lbzc@pve02 |
safeline_luigi.0.mevmm7r7lbzc@pve02 | 2024/11/30 15:06:12 /work/pkg/pg/db.go:19
safeline_luigi.0.mevmm7r7lbzc@pve02 | [error] failed to initialize database, got error failed to connect to `host=tasks.postgres user=safeline-ce database=safeline-ce`: hostname resolving error (lookup tasks.postgres on 127.0.0.11:53: no such host)
safeline_luigi.0.mevmm7r7lbzc@pve02 | panic: init db failed: failed to connect to `host=tasks.postgres user=safeline-ce database=safeline-ce`: hostname resolving error (lookup tasks.postgres on 127.0.0.11:53: no such host)
safeline_luigi.0.mevmm7r7lbzc@pve02 |
safeline_luigi.0.mevmm7r7lbzc@pve02 | goroutine 1 [running]:
safeline_luigi.0.mevmm7r7lbzc@pve02 | luigi/pkg/pg.Init({0xc00003c009?, 0x8?})
safeline_luigi.0.kidzk789je37@pve02 | 2024/11/30 15:06:30 [INFO] cmd/main.go:88 🌍 Server ready at :80
safeline_luigi.0.kidzk789je37@pve02 |
safeline_luigi.0.kidzk789je37@pve02 | 2024/11/30 15:08:20 /work/pkg/pg/model/qps.go:25 SLOW SQL >= 200ms
safeline_luigi.0.kidzk789je37@pve02 | [227.135ms] [rows:1] INSERT INTO "qps" ("time","points") VALUES ('2024-11-30 15:08:20.011','{}') RETURNING "id"
safeline_luigi.0.mevmm7r7lbzc@pve02 | /work/pkg/pg/db.go:58 +0x89
safeline_luigi.0.mevmm7r7lbzc@pve02 | luigi/pkg/pg.init.0()
safeline_luigi.0.mevmm7r7lbzc@pve02 | /work/pkg/pg/db.go:77 +0xc5
mgt:
safeline_mgt.0.sp2a924st8tf@pve02 | panic: failed to connect to `host=tasks.postgres user=safeline-ce database=safeline-ce`: hostname resolving error (lookup tasks.postgres on 127.0.0.11:53: no such host)
safeline_mgt.0.sp2a924st8tf@pve02 |
safeline_mgt.0.sp2a924st8tf@pve02 | goroutine 1 [running]:
safeline_mgt.0.sp2a924st8tf@pve02 | main.main()
safeline_mgt.0.sp2a924st8tf@pve02 | /work/main.go:32 +0x197
Hopefully someone knows how to work around this problem. Thank you for your help.
Edit: I forgot to mention that I tried to ping the postgres service from every luigi container on every host using ping postgres after installing iputils-ping inside the container. On every container I got an answer from postgres without problems.
s3tupw1zard
Updated a year ago
0
Never mind. I didn't notice the timestamps on the log messages. At the beginning (for example, while the Postgres service isn't running yet), the other services that depend on it throw errors. As soon as everything has started normally, it works. I tried to get rid of the error messages using healthchecks, but couldn't get them to work correctly.
This is the compose.yaml with the correct hostname for postgres in case anyone wants to deploy Safeline in Docker Swarm.
# SafeLine stack for Docker Swarm: the working version from the follow-up
# post, using the plain `postgres` hostname for the database.
# All ${...} variables must be present in the environment when
# `docker stack deploy` is run.
networks:
  safeline_net:
    driver: overlay
    attachable: true
    ipam:
      config:
        - subnet: ${SUBNET_PREFIX}.0/24
  proxy_net:
    driver: overlay
    attachable: true

services:
  postgres:
    image: ${IMAGE_PREFIX}/safeline-postgres${ARCH_SUFFIX}:15.2
    hostname: postgres
    volumes:
      - ${SAFELINE_DIR}/resources/postgres/data:/var/lib/postgresql/data
      - /etc/localtime:/etc/localtime:ro
    environment:
      - POSTGRES_USER=safeline-ce
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:?postgres password required}
    networks:
      safeline_net:
        aliases:
          - postgres
    command: ["postgres", "-c", "max_connections=600"]
    healthcheck:
      test: pg_isready -U safeline-ce -d safeline-ce
      interval: 10s
      timeout: 5s
      retries: 10
    deploy:
      # Single replica; the author reports socket-file errors when more
      # than one instance ran against the shared data directory.
      mode: replicated
      replicas: 1
      restart_policy:
        condition: on-failure
      update_config:
        parallelism: 1
        failure_action: rollback
      rollback_config:
        parallelism: 1

  mgt:
    image: ${IMAGE_PREFIX}/safeline-mgt-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG:?image tag required}
    deploy:
      mode: global
      restart_policy:
        # Per the author, dependants log connection errors until postgres
        # is up, then run normally.
        condition: on-failure
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ${SAFELINE_DIR}/resources/mgt:/app/data
      - ${SAFELINE_DIR}/logs/nginx:/app/log/nginx:z
      - ${SAFELINE_DIR}/resources/sock:/app/sock
      - /var/run:/app/run
    ports:
      - "${MGT_PORT:-9443}:1443"
    environment:
      - MGT_PG=postgres://safeline-ce:${POSTGRES_PASSWORD}@postgres/safeline-ce?sslmode=disable
    networks:
      safeline_net:
        aliases:
          - safeline-mgt

  detect:
    image: ${IMAGE_PREFIX}/safeline-detector-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      # Single replica, for the same socket-file reason as postgres.
      mode: replicated
      replicas: 1
      restart_policy:
        condition: on-failure
      update_config:
        parallelism: 1
        failure_action: rollback
      rollback_config:
        parallelism: 1
    healthcheck:
      # Healthy once the detector's Unix socket exists.
      test: ["CMD", "sh", "-c", "[ -S /resources/detector/snserver.sock ]"]
      interval: 30s
      timeout: 10s
      retries: 10
      start_period: 10s
    volumes:
      - ${SAFELINE_DIR}/resources/detector:/resources/detector
      - ${SAFELINE_DIR}/logs/detector:/logs/detector
      - /etc/localtime:/etc/localtime:ro
    environment:
      - LOG_DIR=/logs/detector
    networks:
      safeline_net:
        aliases:
          - safeline-detector

  tengine:
    image: ${IMAGE_PREFIX}/safeline-tengine-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /etc/resolv.conf:/etc/resolv.conf:ro
      - ${SAFELINE_DIR}/resources/nginx:/etc/nginx
      - ${SAFELINE_DIR}/resources/detector:/resources/detector
      - ${SAFELINE_DIR}/resources/chaos:/resources/chaos
      - ${SAFELINE_DIR}/logs/nginx:/var/log/nginx:z
      - ${SAFELINE_DIR}/resources/cache:/usr/local/nginx/cache
      - ${SAFELINE_DIR}/resources/sock:/app/sock
    environment:
      - TCD_MGT_API=https://mgt:1443/api/open/publish/server
      # `safeline-detector` is the network alias of the `detect` service.
      - TCD_SNSERVER=safeline-detector:8000
    ulimits:
      nofile: 131072
    networks:
      safeline_net:
        aliases:
          - safeline-tengine
      proxy_net: {}

  luigi:
    image: ${IMAGE_PREFIX}/safeline-luigi-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ${SAFELINE_DIR}/resources/luigi:/app/data
    environment:
      - LUIGI_PG=postgres://safeline-ce:${POSTGRES_PASSWORD}@postgres/safeline-ce?sslmode=disable
    networks:
      safeline_net:
        aliases:
          - safeline-luigi

  fvm:
    image: ${IMAGE_PREFIX}/safeline-fvm-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    volumes:
      - /etc/localtime:/etc/localtime:ro
    networks:
      safeline_net:
        aliases:
          - safeline-fvm

  chaos:
    image: ${IMAGE_PREFIX}/safeline-chaos-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    volumes:
      - ${SAFELINE_DIR}/resources/sock:/app/sock
      - ${SAFELINE_DIR}/resources/chaos:/app/chaos
    networks:
      safeline_net:
        aliases:
          - safeline-chaos
So it was working the whole time, and I spent a few days trying to debug a problem that wasn't actually a problem.