用 Docker swarm 快速部署分佈式圖數據庫 Nebula Graph 集群

本文作者系：視野金服工程師｜吳海勝
首發於 Nebula Graph 論壇：https://discuss.nebula-graph.com.cn/t/topic/1388

一、前言

本文介紹如何使用 Docker Swarm 來部署 Nebula Graph 集群，並部署客戶端負載均衡和高可用。

二、nebula 集群搭建

2.1 環境準備

機器準備

| ip | 內存(GB) | cpu(核數) |
| --- | --- | --- |
| 192.168.1.166 | 16 | 4 |
| 192.168.1.167 | 16 | 4 |
| 192.168.1.168 | 16 | 4 |

在安裝前確保所有機器已安裝 Docker

2.2 初始化 swarm 集群

在 192.168.1.166 機器上執行

$ docker swarm init --advertise-addr 192.168.1.166
Swarm initialized: current node (dxn1zf6l61qsb1josjja83ngz) is now a manager.
To add a worker to this swarm, run the following command:
 docker swarm join \
 --token SWMTKN-1-49nj1cmql0jkz5s954yi3oex3nedyz0fb0xx14ie39trti4wxv-8vxv8rssmk743ojnwacrr2e7c \
 192.168.1.166:2377
 
To add a manager to this swarm, run 'docker swarm join-token manager' and follow the instructions.

2.3 加入 worker 節點

根據 init 命令提示內容，加入 swarm worker 節點，在 192.168.1.167 192.168.1.168 分別執行

docker swarm join \
 --token SWMTKN-1-49nj1cmql0jkz5s954yi3oex3nedyz0fb0xx14ie39trti4wxv-8vxv8rssmk743ojnwacrr2e7c \
 192.168.1.166:2377

2.4 驗證集群

docker node ls
 
ID                            HOSTNAME            STATUS              AVAILABILITY        MANAGER STATUS      ENGINE VERSION
h0az2wzqetpwhl9ybu76yxaen *   KF2-DATA-166        Ready               Active              Reachable           18.06.1-ce
q6jripaolxsl7xqv3cmv5pxji     KF2-DATA-167        Ready               Active              Leader              18.06.1-ce
h1iql1uvm7123h3gon9so69dy     KF2-DATA-168        Ready               Active                                  18.06.1-ce

2.5 配置 docker stack

vi docker-stack.yml

配置如下內容

version: '3.6'
services:
  # Meta service instance pinned to swarm node KF2-DATA-166 (192.168.1.166).
  metad0:
    image: vesoft/nebula-metad:nightly
    networks:
      - nebula-net
    env_file:
      - ./nebula.env
    volumes:
      - data-metad0:/data/meta
      - logs-metad0:/logs
    # Every port is published in host mode under the same number as its target.
    ports:
      - published: 11000
        target: 11000
        protocol: tcp
        mode: host
      - published: 11002
        target: 11002
        protocol: tcp
        mode: host
      - published: 45500
        target: 45500
        protocol: tcp
        mode: host
    command:
      # The same three meta addresses are passed to every service in this stack.
      - "--meta_server_addrs=192.168.1.166:45500,192.168.1.167:45500,192.168.1.168:45500"
      - "--local_ip=192.168.1.166"
      - "--ws_ip=192.168.1.166"
      - "--port=45500"
      - "--data_path=/data/meta"
      - "--log_dir=/logs"
      - "--v=0"
      - "--minloglevel=2"
    healthcheck:
      # Probe the HTTP status endpoint on port 11000 of this node.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://192.168.1.166:11000/status"
      start_period: 20s
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == KF2-DATA-166
      restart_policy:
        condition: on-failure

  # Meta service instance pinned to swarm node KF2-DATA-167 (192.168.1.167).
  metad1:
    image: vesoft/nebula-metad:nightly
    networks:
      - nebula-net
    env_file:
      - ./nebula.env
    volumes:
      - data-metad1:/data/meta
      - logs-metad1:/logs
    # Every port is published in host mode under the same number as its target.
    ports:
      - published: 11000
        target: 11000
        protocol: tcp
        mode: host
      - published: 11002
        target: 11002
        protocol: tcp
        mode: host
      - published: 45500
        target: 45500
        protocol: tcp
        mode: host
    command:
      # The same three meta addresses are passed to every service in this stack.
      - "--meta_server_addrs=192.168.1.166:45500,192.168.1.167:45500,192.168.1.168:45500"
      - "--local_ip=192.168.1.167"
      - "--ws_ip=192.168.1.167"
      - "--port=45500"
      - "--data_path=/data/meta"
      - "--log_dir=/logs"
      - "--v=0"
      - "--minloglevel=2"
    healthcheck:
      # Probe the HTTP status endpoint on port 11000 of this node.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://192.168.1.167:11000/status"
      start_period: 20s
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == KF2-DATA-167
      restart_policy:
        condition: on-failure

  # Meta service instance pinned to swarm node KF2-DATA-168 (192.168.1.168).
  metad2:
    image: vesoft/nebula-metad:nightly
    networks:
      - nebula-net
    env_file:
      - ./nebula.env
    volumes:
      - data-metad2:/data/meta
      - logs-metad2:/logs
    # Every port is published in host mode under the same number as its target.
    ports:
      - published: 11000
        target: 11000
        protocol: tcp
        mode: host
      - published: 11002
        target: 11002
        protocol: tcp
        mode: host
      - published: 45500
        target: 45500
        protocol: tcp
        mode: host
    command:
      # The same three meta addresses are passed to every service in this stack.
      - "--meta_server_addrs=192.168.1.166:45500,192.168.1.167:45500,192.168.1.168:45500"
      - "--local_ip=192.168.1.168"
      - "--ws_ip=192.168.1.168"
      - "--port=45500"
      - "--data_path=/data/meta"
      - "--log_dir=/logs"
      - "--v=0"
      - "--minloglevel=2"
    healthcheck:
      # Probe the HTTP status endpoint on port 11000 of this node.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://192.168.1.168:11000/status"
      start_period: 20s
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == KF2-DATA-168
      restart_policy:
        condition: on-failure

  # Storage service instance pinned to swarm node KF2-DATA-166 (192.168.1.166).
  storaged0:
    image: vesoft/nebula-storaged:nightly
    networks:
      - nebula-net
    env_file:
      - ./nebula.env
    volumes:
      - data-storaged0:/data/storage
      - logs-storaged0:/logs
    ports:
      - published: 12000
        target: 12000
        protocol: tcp
        mode: host
      - published: 12002
        target: 12002
        protocol: tcp
        mode: host
    command:
      - "--meta_server_addrs=192.168.1.166:45500,192.168.1.167:45500,192.168.1.168:45500"
      - "--local_ip=192.168.1.166"
      - "--ws_ip=192.168.1.166"
      - "--port=44500"
      - "--data_path=/data/storage"
      - "--log_dir=/logs"
      - "--v=0"
      - "--minloglevel=2"
    healthcheck:
      # Probe the HTTP status endpoint on port 12000 of this node.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://192.168.1.166:12000/status"
      start_period: 20s
      interval: 30s
      timeout: 10s
      retries: 3
    # NOTE(review): `docker stack deploy` is documented to ignore depends_on
    # for version 3 files, so this list is informational only - confirm.
    depends_on:
      - metad0
      - metad1
      - metad2
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == KF2-DATA-166
      restart_policy:
        condition: on-failure
  # Storage service instance pinned to swarm node KF2-DATA-167 (192.168.1.167).
  storaged1:
    image: vesoft/nebula-storaged:nightly
    networks:
      - nebula-net
    env_file:
      - ./nebula.env
    volumes:
      - data-storaged1:/data/storage
      - logs-storaged1:/logs
    ports:
      - published: 12000
        target: 12000
        protocol: tcp
        mode: host
      # NOTE(review): target 12002 is published as 12004 here, while storaged0
      # publishes it as 12002 - confirm the different number is intended.
      - published: 12004
        target: 12002
        protocol: tcp
        mode: host
    command:
      - "--meta_server_addrs=192.168.1.166:45500,192.168.1.167:45500,192.168.1.168:45500"
      - "--local_ip=192.168.1.167"
      - "--ws_ip=192.168.1.167"
      - "--port=44500"
      - "--data_path=/data/storage"
      - "--log_dir=/logs"
      - "--v=0"
      - "--minloglevel=2"
    healthcheck:
      # Probe the HTTP status endpoint on port 12000 of this node.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://192.168.1.167:12000/status"
      start_period: 20s
      interval: 30s
      timeout: 10s
      retries: 3
    # NOTE(review): `docker stack deploy` is documented to ignore depends_on
    # for version 3 files, so this list is informational only - confirm.
    depends_on:
      - metad0
      - metad1
      - metad2
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == KF2-DATA-167
      restart_policy:
        condition: on-failure

  # Storage service instance pinned to swarm node KF2-DATA-168 (192.168.1.168).
  storaged2:
    image: vesoft/nebula-storaged:nightly
    networks:
      - nebula-net
    env_file:
      - ./nebula.env
    volumes:
      - data-storaged2:/data/storage
      - logs-storaged2:/logs
    ports:
      - published: 12000
        target: 12000
        protocol: tcp
        mode: host
      # NOTE(review): target 12002 is published as 12006 here, while storaged0
      # publishes it as 12002 - confirm the different number is intended.
      - published: 12006
        target: 12002
        protocol: tcp
        mode: host
    command:
      - "--meta_server_addrs=192.168.1.166:45500,192.168.1.167:45500,192.168.1.168:45500"
      - "--local_ip=192.168.1.168"
      - "--ws_ip=192.168.1.168"
      - "--port=44500"
      - "--data_path=/data/storage"
      - "--log_dir=/logs"
      - "--v=0"
      - "--minloglevel=2"
    healthcheck:
      # Probe the HTTP status endpoint on port 12000 of this node.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://192.168.1.168:12000/status"
      start_period: 20s
      interval: 30s
      timeout: 10s
      retries: 3
    # NOTE(review): `docker stack deploy` is documented to ignore depends_on
    # for version 3 files, so this list is informational only - confirm.
    depends_on:
      - metad0
      - metad1
      - metad2
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == KF2-DATA-168
      restart_policy:
        condition: on-failure
  # Graph (query) service instance pinned to swarm node KF2-DATA-166.
  graphd1:
    image: vesoft/nebula-graphd:nightly
    networks:
      - nebula-net
    env_file:
      - ./nebula.env
    volumes:
      - logs-graphd:/logs
    ports:
      # Client port, the one HAProxy forwards to.
      - published: 3699
        target: 3699
        protocol: tcp
        mode: host
      - published: 13000
        target: 13000
        protocol: tcp
        # mode: host  (left commented out, so no explicit publish mode is set)
      - published: 13002
        target: 13002
        protocol: tcp
        mode: host
    command:
      - "--meta_server_addrs=192.168.1.166:45500,192.168.1.167:45500,192.168.1.168:45500"
      - "--port=3699"
      - "--ws_ip=192.168.1.166"
      - "--log_dir=/logs"
      - "--v=0"
      - "--minloglevel=2"
    healthcheck:
      # Probe the HTTP status endpoint on port 13000 of this node.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://192.168.1.166:13000/status"
      start_period: 20s
      interval: 30s
      timeout: 10s
      retries: 3
    # NOTE(review): `docker stack deploy` is documented to ignore depends_on
    # for version 3 files, so this list is informational only - confirm.
    depends_on:
      - metad0
      - metad1
      - metad2
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == KF2-DATA-166
      restart_policy:
        condition: on-failure

  # Graph (query) service instance pinned to swarm node KF2-DATA-167.
  graphd2:
    image: vesoft/nebula-graphd:nightly
    networks:
      - nebula-net
    env_file:
      - ./nebula.env
    volumes:
      - logs-graphd2:/logs
    ports:
      # Client port 3699 is published as 3640 on this node.
      - published: 3640
        target: 3699
        protocol: tcp
        mode: host
      - published: 13001
        target: 13000
        protocol: tcp
        mode: host
      - published: 13003
        target: 13002
        protocol: tcp
        # mode: host  (left commented out, so no explicit publish mode is set)
    command:
      - "--meta_server_addrs=192.168.1.166:45500,192.168.1.167:45500,192.168.1.168:45500"
      - "--port=3699"
      - "--ws_ip=192.168.1.167"
      - "--log_dir=/logs"
      # NOTE(review): graphd1 and graphd3 pass --v=0; confirm 2 is intended.
      - "--v=2"
      - "--minloglevel=2"
    healthcheck:
      # Probes the published port 13001 (target 13000).
      # NOTE(review): verify this port is reachable when the service is
      # attached to the host network.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://192.168.1.167:13001/status"
      start_period: 20s
      interval: 30s
      timeout: 10s
      retries: 3
    # NOTE(review): `docker stack deploy` is documented to ignore depends_on
    # for version 3 files, so this list is informational only - confirm.
    depends_on:
      - metad0
      - metad1
      - metad2
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == KF2-DATA-167
      restart_policy:
        condition: on-failure
  # Graph (query) service instance pinned to swarm node KF2-DATA-168.
  graphd3:
    image: vesoft/nebula-graphd:nightly
    networks:
      - nebula-net
    env_file:
      - ./nebula.env
    volumes:
      - logs-graphd3:/logs
    ports:
      # Client port 3699 is published as 3641 on this node.
      - published: 3641
        target: 3699
        protocol: tcp
        mode: host
      - published: 13002
        target: 13000
        protocol: tcp
        # mode: host  (left commented out, so no explicit publish mode is set)
      - published: 13004
        target: 13002
        protocol: tcp
        mode: host
    command:
      - "--meta_server_addrs=192.168.1.166:45500,192.168.1.167:45500,192.168.1.168:45500"
      - "--port=3699"
      - "--ws_ip=192.168.1.168"
      - "--log_dir=/logs"
      - "--v=0"
      - "--minloglevel=2"
    healthcheck:
      # Probes the published port 13002 (target 13000).
      # NOTE(review): verify this port is reachable when the service is
      # attached to the host network.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://192.168.1.168:13002/status"
      start_period: 20s
      interval: 30s
      timeout: 10s
      retries: 3
    # NOTE(review): `docker stack deploy` is documented to ignore depends_on
    # for version 3 files, so this list is informational only - confirm.
    depends_on:
      - metad0
      - metad1
      - metad2
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == KF2-DATA-168
      restart_policy:
        condition: on-failure
# nebula-net is not created by this stack: it refers to the existing network
# named "host".
networks:
  nebula-net:
    name: host
    external: true
    attachable: true
# Named volumes with default settings, one per mount used by the services.
volumes:
  # Data directories of the meta and storage services.
  data-metad0: {}
  data-metad1: {}
  data-metad2: {}
  data-storaged0: {}
  data-storaged1: {}
  data-storaged2: {}
  # Log directories of all services.
  logs-metad0: {}
  logs-metad1: {}
  logs-metad2: {}
  logs-storaged0: {}
  logs-storaged1: {}
  logs-storaged2: {}
  logs-graphd: {}
  logs-graphd2: {}
  logs-graphd3: {}

編輯 nebula.env，加入如下內容

TZ=UTC
USER=root

2.6 啟動 nebula 集群

docker stack deploy nebula -c docker-stack.yml

三、集群負載均衡及高可用配置

Nebula Graph 的客戶端目前（1.X）沒有提供負載均衡的能力，只是隨機選一個 graphd 去連接。所以生產使用的時候要自己做個負載均衡和高可用。

圖 3.1

將整個部署架構分為三層，數據服務層，負載均衡層及高可用層。如圖 3.1 所示

負載均衡層：對 client 請求做負載均衡，將請求分發至下方數據服務層

高可用層: 這裡實現的是 haproxy 的高可用，保證負載均衡層的服務從而保證整個集群的正常服務

3.1 負載均衡配置

haproxy 使用 docker-compose 配置。分別編輯以下三個文件

Dockerfile 加入以下內容

# HAProxy 1.7 image carrying the load-balancer configuration.
FROM haproxy:1.7
# Port that the graphd-cluster listener in haproxy.cfg binds to.
EXPOSE 3640
# Bake the configuration into the image at the path HAProxy reads it from.
COPY haproxy.cfg /usr/local/etc/haproxy/haproxy.cfg

docker-compose.yml 加入以下內容

version: "3.2"
services:
  # HAProxy container built from the Dockerfile in this directory.
  haproxy:
    build: .
    container_name: haproxy
    restart: always
    networks:
      - app_net
    ports:
      # Quoted so the mapping is always read as a string.
      - "3640:3640"
    volumes:
      # Mount the local config over the copy baked into the image.
      - ./haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg
networks:
  # External network: it is referenced here, not created by this file.
  app_net:
    external: true

haproxy.cfg 加入以下內容

# Process-wide settings.
global
    maxconn 30000
    daemon
    log 127.0.0.1 local0 info
    log 127.0.0.1 local1 warning

# Defaults inherited by every proxy section below unless overridden there.
defaults
    mode http
    option http-keep-alive
    log global
    log-format %hr\ %ST\ %B\ %Ts
    timeout http-request 20000ms
    timeout connect 5000ms
    timeout client 10000ms
    timeout server 50000ms

# Add your own frontend / backend / listen sections below.
# CUSTOM

# TCP load balancer in front of the three graphd instances.
listen graphd-cluster
    mode tcp
    bind *:3640
    balance roundrobin
    maxconn 300
    server server1 192.168.1.166:3699 maxconn 300 check
    server server2 192.168.1.167:3699 maxconn 300 check
    server server3 192.168.1.168:3699 maxconn 300 check

# Statistics page served at /stats on port 1080.
listen stats
    bind *:1080
    stats uri /stats
    stats refresh 30s

3.2 啟動 haproxy

docker-compose up -d

3.3 高可用配置

注：配置 keepalived 需預先準備好 vip（虛擬 ip），在以下配置中 192.168.1.99 便為虛擬 ip

在 192.168.1.166 、192.168.1.167、192.168.1.168上均做以下配置

安裝 keepalived

# Refresh the package index, upgrade installed packages, then install keepalived.
apt-get update \
  && apt-get upgrade \
  && apt-get install -y keepalived

更改 keepalived配置文件 /etc/keepalived/keepalived.conf（三臺機器中做如下配置，priority 應設置不同值確定優先級）

192.168.1.166 機器配置

# keepalived configuration for 192.168.1.166, the preferred MASTER of VIP 192.168.1.99.
global_defs {
    router_id lb01 # Identifier string; just a name.
}
# Health check: "killall -0" sends no signal, it only tests that a haproxy process exists.
vrrp_script chk_haproxy {
    script "killall -0 haproxy"
    # Run the check every 2 seconds. Each keyword must be on its own line.
    interval 2
}
vrrp_instance VI_1 {
    state MASTER
    interface ens160
    virtual_router_id 52
    # VRRP priority must be within 1-255; this is the highest of the three nodes,
    # so this machine is preferred as MASTER.
    priority 150
    # Advertisement interval between MASTER and BACKUP load balancers, in seconds.
    advert_int 1
    # Authentication type and password.
    authentication {
        # Authentication type: PASS or AH.
        auth_type PASS
        # Password; MASTER and BACKUP of the same vrrp_instance must use the same one.
        auth_pass amber1
    }
    virtual_ipaddress {
        # Virtual IP 192.168.1.99/24 bound to interface ens160 with label ens160:1;
        # identical on master and backups.
        192.168.1.99/24 dev ens160 label ens160:1
    }
    track_script {
        chk_haproxy
    }
}

192.168.1.167 機器配置

# keepalived configuration for 192.168.1.167, a BACKUP for VIP 192.168.1.99.
global_defs {
    router_id lb01 # Identifier string; just a name.
}
# Health check: "killall -0" sends no signal, it only tests that a haproxy process exists.
vrrp_script chk_haproxy {
    script "killall -0 haproxy"
    # Run the check every 2 seconds. Each keyword must be on its own line.
    interval 2
}
vrrp_instance VI_1 {
    state BACKUP
    interface ens160
    virtual_router_id 52
    # VRRP priority must be within 1-255; use a different value on each machine,
    # higher values being preferred when electing the MASTER.
    priority 100
    # Advertisement interval between MASTER and BACKUP load balancers, in seconds.
    advert_int 1
    # Authentication type and password.
    authentication {
        # Authentication type: PASS or AH.
        auth_type PASS
        # Password; MASTER and BACKUP of the same vrrp_instance must use the same one.
        auth_pass amber1
    }
    virtual_ipaddress {
        # Virtual IP 192.168.1.99/24 bound to interface ens160 with label ens160:1;
        # identical on master and backups.
        192.168.1.99/24 dev ens160 label ens160:1
    }
    track_script {
        chk_haproxy
    }
}

192.168.1.168 機器配置

# keepalived configuration for 192.168.1.168, a BACKUP for VIP 192.168.1.99.
global_defs {
    router_id lb01 # Identifier string; just a name.
}
# Health check: "killall -0" sends no signal, it only tests that a haproxy process exists.
vrrp_script chk_haproxy {
    script "killall -0 haproxy"
    # Run the check every 2 seconds. Each keyword must be on its own line.
    interval 2
}
vrrp_instance VI_1 {
    state BACKUP
    interface ens160
    virtual_router_id 52
    # VRRP priority must be within 1-255; use a different value on each machine,
    # higher values being preferred when electing the MASTER.
    priority 50
    # Advertisement interval between MASTER and BACKUP load balancers, in seconds.
    advert_int 1
    # Authentication type and password.
    authentication {
        # Authentication type: PASS or AH.
        auth_type PASS
        # Password; MASTER and BACKUP of the same vrrp_instance must use the same one.
        auth_pass amber1
    }
    virtual_ipaddress {
        # Virtual IP 192.168.1.99/24 bound to interface ens160 with label ens160:1;
        # identical on master and backups.
        192.168.1.99/24 dev ens160 label ens160:1
    }
    track_script {
        chk_haproxy
    }
}

keepalived 相關命令

# Start keepalived
systemctl start keepalived
# Enable keepalived at boot (the unit is named "keepalived")
systemctl enable keepalived
# Restart keepalived
systemctl restart keepalived

四、其他

離線怎麼部署？把鏡像更改為私有鏡像庫就成了，有問題歡迎來勾搭啊。

我的小魚你醒了，還認識早晨嗎？昨夜你曾經說，願夜幕永不開啟。

如果你對本文有任何疑問，歡迎來論壇和原作者聊聊~~ 原帖地址：https://discuss.nebula-graph.com.cn/t/topic/1388