DiscussionSLA

Safeline running in Docker Swarm

Published a year ago

# Github Discussion
# Q&A

Published a year ago

profile_photo

s3tupw1zard

Updated a year ago

0

Hello,
I adapted the compose.yaml to run it in Docker Swarm. As soon as I deploy the stack, I can see that my luigi and mgt services can't connect to my postgres service using either the hostname postgres or tasks.postgres. Also, luigi can't connect to the detect service.

I put the bind mounts on Ceph storage managed through Proxmox so that every host always has the same data.

This is my compose.yaml:

# Networks declared for the stack; both use the overlay driver and are attachable.
networks:
  # Network that every service in this file joins; its /24 subnet is built
  # from the SUBNET_PREFIX variable.
  safeline_net:
    driver: overlay
    attachable: true
    ipam:
      config:
        - subnet: ${SUBNET_PREFIX}.0/24
  # Second overlay network; in this file only the tengine service attaches to it.
  proxy_net:
    driver: overlay
    attachable: true

services:
  # PostgreSQL service, deployed as a single replica. Container hostname is
  # "postgres"; on safeline_net it also carries the alias "safeline-postgres".
  postgres:
    image: ${IMAGE_PREFIX}/safeline-postgres${ARCH_SUFFIX}:15.2
    hostname: postgres
    volumes:
      # Database files live under SAFELINE_DIR on the host.
      - ${SAFELINE_DIR}/resources/postgres/data:/var/lib/postgresql/data
      - /etc/localtime:/etc/localtime:ro
    environment:
      - POSTGRES_USER=safeline-ce
      # The ${VAR:?message} form aborts interpolation with the given message
      # when POSTGRES_PASSWORD is not set.
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:?postgres password required}
    networks:
      safeline_net:
        aliases:
          - safeline-postgres
    # Starts the server with max_connections raised to 600.
    command: [postgres, -c, max_connections=600]
    healthcheck:
      # Probes readiness for user and database "safeline-ce".
      test: pg_isready -U safeline-ce -d safeline-ce
      interval: 10s
      timeout: 5s
      retries: 10
    deploy:
      mode: replicated
      replicas: 1
      restart_policy:
        condition: on-failure
      update_config:
        parallelism: 1
        failure_action: rollback
      rollback_config:
        parallelism: 1


  # Management service, deployed in global mode. Container port 1443 is
  # published on MGT_PORT (9443 when the variable is unset).
  mgt:
    # IMAGE_TAG is mandatory here: the ${VAR:?message} form fails with the
    # message when it is not set.
    image: ${IMAGE_PREFIX}/safeline-mgt-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG:?image tag required}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ${SAFELINE_DIR}/resources/mgt:/app/data
      - ${SAFELINE_DIR}/logs/nginx:/app/log/nginx:z
      # The same host sock directory is also mounted by tengine and chaos.
      - ${SAFELINE_DIR}/resources/sock:/app/sock
      - /var/run:/app/run
    ports:
      - ${MGT_PORT:-9443}:1443
    environment:
      # Database host is written as "tasks.postgres"; the mgt log excerpt
      # after this file reports that name as "no such host".
      - MGT_PG=postgres://safeline-ce:${POSTGRES_PASSWORD}@tasks.postgres/safeline-ce?sslmode=disable
    networks:
      safeline_net:
        aliases:
          - safeline-mgt

  # Detector service, deployed as a single replica and reachable on
  # safeline_net under the alias "safeline-detector".
  safeline-detector:
    image: ${IMAGE_PREFIX}/safeline-detector-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: replicated
      replicas: 1
      restart_policy:
        condition: on-failure
      update_config:
        parallelism: 1
        failure_action: rollback
      rollback_config:
        parallelism: 1
    healthcheck:
      # Healthy once snserver.sock exists as a socket in the mounted
      # detector resources directory.
      test: ["CMD", "sh", "-c", "[ -S /resources/detector/snserver.sock ]"]
      interval: 30s
      timeout: 10s
      retries: 10
      start_period: 10s
    volumes:
      # The same host directory is also mounted by tengine at the same path.
      - ${SAFELINE_DIR}/resources/detector:/resources/detector
      - ${SAFELINE_DIR}/logs/detector:/logs/detector
      - /etc/localtime:/etc/localtime:ro
    environment:
      # Points at the logs/detector mount declared above.
      - LOG_DIR=/logs/detector
    networks:
      safeline_net:
        aliases:
          - safeline-detector

  # Tengine service, deployed in global mode. Publishes ports 80 and 443 and
  # is the only service in this file attached to both networks.
  tengine:
    image: ${IMAGE_PREFIX}/safeline-tengine-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    ports:
      - 80:80
      - 443:443
    volumes:
      - /etc/localtime:/etc/localtime:ro
      # The host's resolver configuration is mounted read-only over the
      # container's own /etc/resolv.conf.
      - /etc/resolv.conf:/etc/resolv.conf:ro
      - ${SAFELINE_DIR}/resources/nginx:/etc/nginx
      # Shared with the detector service, which mounts the same host directory.
      - ${SAFELINE_DIR}/resources/detector:/resources/detector
      - ${SAFELINE_DIR}/resources/chaos:/resources/chaos
      - ${SAFELINE_DIR}/logs/nginx:/var/log/nginx:z
      - ${SAFELINE_DIR}/resources/cache:/usr/local/nginx/cache
      - ${SAFELINE_DIR}/resources/sock:/app/sock
    environment:
      # Addresses the mgt service by its service name on port 1443.
      - TCD_MGT_API=https://mgt:1443/api/open/publish/server
      # Addresses the detector by its safeline_net alias on port 8000.
      - TCD_SNSERVER=safeline-detector:8000
    ulimits:
      nofile: 131072
    networks:
      safeline_net:
        aliases:
          - safeline-tengine
      # No per-network options are given for proxy_net.
      proxy_net:
        

  # Luigi service, deployed in global mode and attached to safeline_net only.
  luigi:
    image: ${IMAGE_PREFIX}/safeline-luigi-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ${SAFELINE_DIR}/resources/luigi:/app/data
    environment:
      # Database host is written as "tasks.postgres"; the luigi log excerpt
      # after this file reports that name as "no such host".
      - LUIGI_PG=postgres://safeline-ce:${POSTGRES_PASSWORD}@tasks.postgres/safeline-ce?sslmode=disable
    networks:
      safeline_net:
        aliases:
          - safeline-luigi

  # FVM service, deployed in global mode. It mounts only the host timezone
  # file and has no environment or published ports in this file.
  fvm:
    image: ${IMAGE_PREFIX}/safeline-fvm-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    volumes:
      - /etc/localtime:/etc/localtime:ro
    networks:
      safeline_net:
        aliases:
          - safeline-fvm

  # Chaos service, deployed in global mode and attached to safeline_net only.
  chaos:
    image: ${IMAGE_PREFIX}/safeline-chaos-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    volumes:
      # Same host sock directory that mgt and tengine mount at /app/sock.
      - ${SAFELINE_DIR}/resources/sock:/app/sock
      # Same host directory that tengine mounts at /resources/chaos.
      - ${SAFELINE_DIR}/resources/chaos:/app/chaos
    networks:
      safeline_net:
        aliases:
          - safeline-chaos

The .env file is almost the same as usual except that the variable SAFELINE_DIR is set to a ceph folder managed through proxmox so that every host always has the same data.

I run only one replica each of postgres and the detector because I got errors about sock files that couldn't be opened, since another instance already had them open.

Then I deploy the stack using export $(cat .env) > /dev/null 2>&1; docker stack deploy -c compose.yaml safeline -d

Here are snippets of my logs from luigi and mgt services:

Luigi:

safeline_luigi.0.3n9tdzrpzzex@pve02    | [error] failed to initialize database, got error failed to connect to `host=tasks.postgres user=safeline-ce database=safeline-ce`: hostname resolving error (lookup tasks.postgres on 127.0.0.11:53: no such host)
safeline_luigi.0.3n9tdzrpzzex@pve02    | panic: init db failed: failed to connect to `host=tasks.postgres user=safeline-ce database=safeline-ce`: hostname resolving error (lookup tasks.postgres on 127.0.0.11:53: no such host)
safeline_luigi.0.3n9tdzrpzzex@pve02    | 
safeline_luigi.0.3n9tdzrpzzex@pve02    | goroutine 1 [running]:
safeline_luigi.0.3n9tdzrpzzex@pve02    | luigi/pkg/pg.Init({0xc00003c009?, 0x8?})
safeline_luigi.0.3n9tdzrpzzex@pve02    |        /work/pkg/pg/db.go:58 +0x89
safeline_luigi.0.3n9tdzrpzzex@pve02    | luigi/pkg/pg.init.0()
safeline_luigi.0.3n9tdzrpzzex@pve02    |        /work/pkg/pg/db.go:77 +0xc5
safeline_luigi.0.mevmm7r7lbzc@pve02    | 
safeline_luigi.0.mevmm7r7lbzc@pve02    | 2024/11/30 15:06:12 /work/pkg/pg/db.go:19
safeline_luigi.0.mevmm7r7lbzc@pve02    | [error] failed to initialize database, got error failed to connect to `host=tasks.postgres user=safeline-ce database=safeline-ce`: hostname resolving error (lookup tasks.postgres on 127.0.0.11:53: no such host)
safeline_luigi.0.mevmm7r7lbzc@pve02    | panic: init db failed: failed to connect to `host=tasks.postgres user=safeline-ce database=safeline-ce`: hostname resolving error (lookup tasks.postgres on 127.0.0.11:53: no such host)
safeline_luigi.0.mevmm7r7lbzc@pve02    | 
safeline_luigi.0.mevmm7r7lbzc@pve02    | goroutine 1 [running]:
safeline_luigi.0.mevmm7r7lbzc@pve02    | luigi/pkg/pg.Init({0xc00003c009?, 0x8?})
safeline_luigi.0.kidzk789je37@pve02    | 2024/11/30 15:06:30 [INFO] cmd/main.go:88 🌍 Server ready at :80
safeline_luigi.0.kidzk789je37@pve02    | 
safeline_luigi.0.kidzk789je37@pve02    | 2024/11/30 15:08:20 /work/pkg/pg/model/qps.go:25 SLOW SQL >= 200ms
safeline_luigi.0.kidzk789je37@pve02    | [227.135ms] [rows:1] INSERT INTO "qps" ("time","points") VALUES ('2024-11-30 15:08:20.011','{}') RETURNING "id"
safeline_luigi.0.mevmm7r7lbzc@pve02    |        /work/pkg/pg/db.go:58 +0x89
safeline_luigi.0.mevmm7r7lbzc@pve02    | luigi/pkg/pg.init.0()
safeline_luigi.0.mevmm7r7lbzc@pve02    |        /work/pkg/pg/db.go:77 +0xc5

mgt:

safeline_mgt.0.sp2a924st8tf@pve02    | panic: failed to connect to `host=tasks.postgres user=safeline-ce database=safeline-ce`: hostname resolving error (lookup tasks.postgres on 127.0.0.11:53: no such host)
safeline_mgt.0.sp2a924st8tf@pve02    | 
safeline_mgt.0.sp2a924st8tf@pve02    | goroutine 1 [running]:
safeline_mgt.0.sp2a924st8tf@pve02    | main.main()
safeline_mgt.0.sp2a924st8tf@pve02    |  /work/main.go:32 +0x197

Hopefully someone knows how to work around this problem. Thank you for your help.

Edit: I forgot to mention that I tried to ping the postgres service from every luigi container on every host using ping postgres after installing iputils-ping inside the container. On every container I got an answer from postgres without problems.

profile_photo

s3tupw1zard

Updated a year ago

0

Never mind. I didn't notice the timestamps on the log messages. At the beginning, when the Postgres service isn't running yet, the other services that depend on it throw errors. Once everything has started normally, it works. I tried to get rid of the error messages using healthchecks, but couldn't get them to run correctly.

This is the compose.yaml with the correct hostname for postgres in case anyone wants to deploy Safeline in Docker Swarm.

# Overlay networks used by the Swarm stack.
networks:
  # Extra network; tengine is the only service in this file that joins it.
  proxy_net:
    attachable: true
    driver: overlay
  # Network shared by every service; the /24 comes from SUBNET_PREFIX.
  safeline_net:
    attachable: true
    driver: overlay
    ipam:
      config:
        - subnet: '${SUBNET_PREFIX}.0/24'

services:
  # Database used by mgt and luigi (see their *_PG connection strings).
  # Exactly one replica is run.
  postgres:
    image: '${IMAGE_PREFIX}/safeline-postgres${ARCH_SUFFIX}:15.2'
    hostname: postgres
    # Server is started with a raised connection limit.
    command:
      - postgres
      - '-c'
      - max_connections=600
    environment:
      - 'POSTGRES_USER=safeline-ce'
      # Interpolation fails with this message when the password is not set.
      - 'POSTGRES_PASSWORD=${POSTGRES_PASSWORD:?postgres password required}'
    volumes:
      - '${SAFELINE_DIR}/resources/postgres/data:/var/lib/postgresql/data'
      - '/etc/localtime:/etc/localtime:ro'
    networks:
      safeline_net:
        aliases:
          - postgres
    healthcheck:
      test: 'pg_isready -U safeline-ce -d safeline-ce'
      retries: 10
      interval: 10s
      timeout: 5s
    deploy:
      mode: replicated
      replicas: 1
      restart_policy:
        condition: on-failure
      rollback_config:
        parallelism: 1
      update_config:
        failure_action: rollback
        parallelism: 1


  # Management service in global mode. Container port 1443 is published on
  # MGT_PORT, falling back to 9443.
  mgt:
    # IMAGE_TAG must be set; interpolation fails with the message otherwise.
    image: '${IMAGE_PREFIX}/safeline-mgt-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG:?image tag required}'
    environment:
      # The database is addressed by the plain name "postgres".
      - 'MGT_PG=postgres://safeline-ce:${POSTGRES_PASSWORD}@postgres/safeline-ce?sslmode=disable'
    ports:
      - '${MGT_PORT:-9443}:1443'
    volumes:
      - '/etc/localtime:/etc/localtime:ro'
      - '${SAFELINE_DIR}/resources/mgt:/app/data'
      - '${SAFELINE_DIR}/logs/nginx:/app/log/nginx:z'
      - '${SAFELINE_DIR}/resources/sock:/app/sock'
      - '/var/run:/app/run'
    networks:
      safeline_net:
        aliases:
          - safeline-mgt
    deploy:
      restart_policy:
        condition: on-failure
      mode: global

  # Detector, one replica. The service is named "detect" but stays reachable
  # on safeline_net as "safeline-detector" through its alias.
  detect:
    image: '${IMAGE_PREFIX}/safeline-detector-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}'
    environment:
      - 'LOG_DIR=/logs/detector'
    volumes:
      - '${SAFELINE_DIR}/resources/detector:/resources/detector'
      - '${SAFELINE_DIR}/logs/detector:/logs/detector'
      - '/etc/localtime:/etc/localtime:ro'
    networks:
      safeline_net:
        aliases:
          - safeline-detector
    healthcheck:
      # Passes once snserver.sock exists as a socket in the mounted directory.
      test:
        - CMD
        - sh
        - '-c'
        - '[ -S /resources/detector/snserver.sock ]'
      start_period: 10s
      interval: 30s
      timeout: 10s
      retries: 10
    deploy:
      mode: replicated
      replicas: 1
      restart_policy:
        condition: on-failure
      rollback_config:
        parallelism: 1
      update_config:
        failure_action: rollback
        parallelism: 1

  # Tengine service in global mode. Publishes ports 80 and 443 and is the only
  # service attached to proxy_net in addition to safeline_net.
  tengine:
    image: ${IMAGE_PREFIX}/safeline-tengine-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
    ports:
      # Quoted so a HOST:CONTAINER pair is always read as a string, never as
      # a YAML number.
      - "80:80"
      - "443:443"
    volumes:
      - /etc/localtime:/etc/localtime:ro
      # NOTE(review): this mounts the host's resolver config over the
      # container's own. Presumably that bypasses Docker's embedded DNS
      # (127.0.0.11), which the names "mgt" and "safeline-detector" below
      # rely on. Confirm tengine still resolves them, or drop this mount.
      - /etc/resolv.conf:/etc/resolv.conf:ro
      - ${SAFELINE_DIR}/resources/nginx:/etc/nginx
      # Same host directory the detector mounts at /resources/detector.
      - ${SAFELINE_DIR}/resources/detector:/resources/detector
      - ${SAFELINE_DIR}/resources/chaos:/resources/chaos
      - ${SAFELINE_DIR}/logs/nginx:/var/log/nginx:z
      - ${SAFELINE_DIR}/resources/cache:/usr/local/nginx/cache
      - ${SAFELINE_DIR}/resources/sock:/app/sock
    environment:
      # mgt is addressed by its service name on port 1443.
      - TCD_MGT_API=https://mgt:1443/api/open/publish/server
      # The detector is addressed by its safeline_net alias on port 8000.
      - TCD_SNSERVER=safeline-detector:8000
    ulimits:
      nofile: 131072
    networks:
      safeline_net:
        aliases:
          - safeline-tengine
      # Explicit empty mapping: join proxy_net with default settings. This
      # replaces a bare key that was followed by a whitespace-only line.
      proxy_net: {}

  # Luigi service in global mode; it talks to the database over safeline_net.
  luigi:
    image: '${IMAGE_PREFIX}/safeline-luigi-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}'
    environment:
      # The database is addressed by the plain name "postgres".
      - 'LUIGI_PG=postgres://safeline-ce:${POSTGRES_PASSWORD}@postgres/safeline-ce?sslmode=disable'
    volumes:
      - '/etc/localtime:/etc/localtime:ro'
      - '${SAFELINE_DIR}/resources/luigi:/app/data'
    networks:
      safeline_net:
        aliases:
          - safeline-luigi
    deploy:
      restart_policy:
        condition: on-failure
      mode: global

  # FVM service in global mode; its only mount is the host timezone file.
  fvm:
    image: '${IMAGE_PREFIX}/safeline-fvm-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}'
    volumes:
      - '/etc/localtime:/etc/localtime:ro'
    networks:
      safeline_net:
        aliases:
          - safeline-fvm
    deploy:
      restart_policy:
        condition: on-failure
      mode: global

  # Chaos service in global mode. It shares the sock directory with mgt and
  # tengine, and the chaos directory with tengine.
  chaos:
    image: '${IMAGE_PREFIX}/safeline-chaos-g${ARCH_SUFFIX}${RELEASE}:${IMAGE_TAG}'
    volumes:
      - '${SAFELINE_DIR}/resources/sock:/app/sock'
      - '${SAFELINE_DIR}/resources/chaos:/app/chaos'
    networks:
      safeline_net:
        aliases:
          - safeline-chaos
    deploy:
      restart_policy:
        condition: on-failure
      mode: global

So it was working the whole time, and I spent a few days debugging a problem that wasn't actually a problem.