Unable to run dgraph in a multi-node kubernetes cluster

Hey @jzhu077

The snapshot error was a bug, which I have fixed and pushed to the dgraph/dgraph:test image. The fix will soon also be available in the dgraph/dgraph:master image.

Here is a config that I tested. It lets me run 3 Dgraph Zeros and 15 Dgraph Servers with 5x replication. The cluster is up in < 2 mins. Give it a try. Also, in case you didn't know, you can check the groups and the nodes in each group by going to http://zero_ip:6080/state. It should show three groups with 5 nodes in each group.

# LoadBalancer Service that exposes ports 9080 (server-grpc) and
# 8080 (server-http) of the pods labelled app=dgraph-server.
apiVersion: v1
kind: Service
metadata:
  labels:
    app: dgraph-server
  name: dgraph-server-public
spec:
  type: LoadBalancer
  selector:
    app: dgraph-server
  ports:
    - name: server-grpc
      port: 9080
      targetPort: 9080
    - name: server-http
      port: 8080
      targetPort: 8080
---
# LoadBalancer Service that exposes ports 5080 (zero-grpc) and
# 6080 (zero-http) of the pods labelled app=dgraph-zero.
apiVersion: v1
kind: Service
metadata:
  labels:
    app: dgraph-zero
  name: dgraph-zero-public
spec:
  type: LoadBalancer
  selector:
    app: dgraph-zero
  ports:
    - name: zero-grpc
      port: 5080
      targetPort: 5080
    - name: zero-http
      port: 6080
      targetPort: 6080
---
# LoadBalancer Service that exposes port 8081 (ratel-http) of the pods
# labelled app=dgraph-ratel.
apiVersion: v1
kind: Service
metadata:
  labels:
    app: dgraph-ratel
  name: dgraph-ratel
spec:
  type: LoadBalancer
  selector:
    app: dgraph-ratel
  ports:
    - name: ratel-http
      port: 8081
      targetPort: 8081
---
# Headless Service (clusterIP: None), necessary for discovery of the pods
# in the dgraph-zero StatefulSet.
# https://kubernetes.io/docs/tutorials/stateful-application/basic-stateful-set/#creating-a-statefulset
apiVersion: v1
kind: Service
metadata:
  labels:
    app: dgraph-zero
  name: dgraph-zero
spec:
  clusterIP: None
  selector:
    app: dgraph-zero
  ports:
    - name: zero-grpc
      port: 5080
      targetPort: 5080
---
# Headless Service (clusterIP: None), necessary for discovery of the pods
# in the dgraph-server StatefulSet.
# https://kubernetes.io/docs/tutorials/stateful-application/basic-stateful-set/#creating-a-statefulset
apiVersion: v1
kind: Service
metadata:
  labels:
    app: dgraph-server
  name: dgraph-server
spec:
  clusterIP: None
  selector:
    app: dgraph-server
  ports:
    - name: grpc
      port: 7080
      targetPort: 7080
---
# This StatefulSet runs 3 Dgraph Zeros.
# Each pod derives its --idx from its StatefulSet ordinal (ordinal + 1).
# Pod 0 starts without a peer; every other pod joins through dgraph-zero-0.
apiVersion: apps/v1beta1
kind: StatefulSet
metadata:
  name: dgraph-zero
spec:
  # Must match the name of the headless Service that governs these pods.
  serviceName: "dgraph-zero"
  replicas: 3
  template:
    metadata:
      labels:
        app: dgraph-zero
    spec:
      containers:
      - name: zero
        image: dgraph/dgraph:test
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 5080
          name: intra-node
        # Declared so the pod spec lists the HTTP port that the
        # dgraph-zero-public Service targets.
        - containerPort: 6080
          name: zero-http
        volumeMounts:
        - name: datadir
          mountPath: /dgraph
        # NOTE: the --peer address below hard-codes the `default` namespace;
        # adjust it when deploying into another namespace.
        # NOTE(review): `-o -2000` is presumably a port offset that puts Zero
        # on 5080/6080 -- confirm against `dgraph zero --help`.
        command:
          - bash
          - "-c"
          - |
            set -ex
            [[ `hostname` =~ -([0-9]+)$ ]] || exit 1
            ordinal=${BASH_REMATCH[1]}
            idx=$(($ordinal + 1))
            if [[ $ordinal -eq 0 ]]; then
              dgraph zero -o -2000 --my=$(hostname -f):5080 --idx $idx --replicas 5
            else
              dgraph zero -o -2000 --my=$(hostname -f):5080 --peer dgraph-zero-0.dgraph-zero.default.svc.cluster.local:5080 --idx $idx --replicas 5
            fi
      terminationGracePeriodSeconds: 60
      volumes:
      - name: datadir
        persistentVolumeClaim:
          claimName: datadir
  updateStrategy:
    type: RollingUpdate
  # One 5Gi ReadWriteOnce claim named `datadir` is requested per pod.
  volumeClaimTemplates:
  - metadata:
      name: datadir
      annotations:
        volume.alpha.kubernetes.io/storage-class: anything
    spec:
      accessModes:
        - "ReadWriteOnce"
      resources:
        requests:
          storage: 5Gi

---
# This StatefulSet runs 15 replicas of Dgraph Server.
# Every server registers with the first Zero pod (dgraph-zero-0) on startup.
apiVersion: apps/v1beta1
kind: StatefulSet
metadata:
  name: dgraph-server
spec:
  # Must match the name of the headless Service that governs these pods.
  serviceName: "dgraph-server"
  replicas: 15
  template:
    metadata:
      labels:
        app: dgraph-server
    spec:
      # Soft rule: prefer to schedule dgraph-server pods on different nodes
      # (topology key kubernetes.io/hostname); not enforced if impossible.
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchExpressions:
                - key: app
                  operator: In
                  values:
                  - dgraph-server
              topologyKey: kubernetes.io/hostname
      containers:
      - name: server
        image: dgraph/dgraph:test
        imagePullPolicy: IfNotPresent
        # Explicit list instead of a bare `ports:` key (which parses as null).
        # These are the ports targeted by the dgraph-server (7080) and
        # dgraph-server-public (8080, 9080) Services.
        ports:
        - containerPort: 7080
          name: intra-node
        - containerPort: 8080
          name: server-http
        - containerPort: 9080
          name: server-grpc
        volumeMounts:
        - name: datadir
          mountPath: /dgraph
        # NOTE: the --zero address below hard-codes the `default` namespace;
        # adjust it when deploying into another namespace.
        command:
          - bash
          - "-c"
          - |
            set -ex
            dgraph server --my=$(hostname -f):7080 --memory_mb 2048 --zero dgraph-zero-0.dgraph-zero.default.svc.cluster.local:5080
      terminationGracePeriodSeconds: 60
      volumes:
      - name: datadir
        persistentVolumeClaim:
          claimName: datadir
  updateStrategy:
    type: RollingUpdate
  # One 5Gi ReadWriteOnce claim named `datadir` is requested per pod.
  volumeClaimTemplates:
  - metadata:
      name: datadir
      annotations:
        volume.alpha.kubernetes.io/storage-class: anything
    spec:
      accessModes:
        - "ReadWriteOnce"
      resources:
        requests:
          storage: 5Gi
---
# Deployment that runs the `dgraph-ratel` command from the
# dgraph/dgraph:test image and declares container port 8081.
apiVersion: apps/v1beta2
kind: Deployment
metadata:
  labels:
    app: dgraph-ratel
  name: dgraph-ratel
spec:
  # Selects the pods created from the template below.
  selector:
    matchLabels:
      app: dgraph-ratel
  template:
    metadata:
      labels:
        app: dgraph-ratel
    spec:
      containers:
        - name: ratel
          image: dgraph/dgraph:test
          command:
            - dgraph-ratel
          ports:
            - containerPort: 8081