Vincent Tech Blog

Wednesday, January 24, 2018

Kubernetes All-in-one node with BIGIP k8s controller

Environment:

A K8S all-in-one node running on a VM with two interfaces: ens3 for management, ens7 for the k8s node/pod network

(The default Docker bridge network is used for the node/pod network, so no extra flannel/calico setup is needed.)

vincent@Kubernete-Dev:~/kubernetes$ ip addr show dev ens3

2: ens3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
    link/ether 52:54:00:ac:0c:27 brd ff:ff:ff:ff:ff:ff
    inet 192.168.48.68/24 brd 192.168.48.255 scope global ens3
       valid_lft forever preferred_lft forever
    inet6 fe80::5054:ff:feac:c27/64 scope link
       valid_lft forever preferred_lft forever

vincent@Kubernete-Dev:~/kubernetes$ ip addr show ens7
3: ens7: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
    link/ether 52:54:00:cf:3c:e0 brd ff:ff:ff:ff:ff:ff
    inet 192.168.1.1/24 brd 192.168.1.255 scope global ens7
       valid_lft forever preferred_lft forever
    inet6 fe80::5054:ff:fecf:3ce0/64 scope link
       valid_lft forever preferred_lft forever

BIGIP VE VM
[root@bigip-kvm:Active:Standalone] config # tmsh list net self
net self external {
    address 192.168.1.2/24
    allow-service all
    traffic-group traffic-group-local-only
    vlan external
}


Follow the link below to set up an all-in-one K8S cluster, replacing all 127.0.0.1 addresses with 192.168.1.1 in hack/local-up-cluster.sh:

https://github.com/kubernetes/community/blob/master/contributors/devel/running-locally.md
**KUBEPROXY_MODE=ipvs selects the newer kube-proxy IPVS mode; its performance is better than iptables, but it requires the kernel ip_vs/ip_vs_rr/ip_vs_sh/ip_vs_wrr modules**
**remove the KUBEPROXY_MODE=ipvs environment variable to just use iptables for an easier setup**
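If you keep IPVS mode, a minimal sketch for pre-loading the modules listed above on the host (run as root; lsmod is just for verification) could be:

# load the IPVS kernel modules kube-proxy's ipvs mode needs
for m in ip_vs ip_vs_rr ip_vs_sh ip_vs_wrr; do
    modprobe "$m"
done
# confirm they are loaded
lsmod | grep ip_vs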
** hack/local-up-cluster.sh diff **
vincent@Kubernete-Dev:~/kubernetes$ git diff
diff --git a/hack/local-up-cluster.sh b/hack/local-up-cluster.sh
index 98a0a6b..be143af 100755
--- a/hack/local-up-cluster.sh
+++ b/hack/local-up-cluster.sh
@@ -64,7 +64,7 @@ DNS_DOMAIN=${KUBE_DNS_NAME:-"cluster.local"}
 KUBECTL=${KUBECTL:-cluster/kubectl.sh}
 WAIT_FOR_URL_API_SERVER=${WAIT_FOR_URL_API_SERVER:-20}
 ENABLE_DAEMON=${ENABLE_DAEMON:-false}
-HOSTNAME_OVERRIDE=${HOSTNAME_OVERRIDE:-"127.0.0.1"}
+HOSTNAME_OVERRIDE=${HOSTNAME_OVERRIDE:-"192.168.1.1"}
 CLOUD_PROVIDER=${CLOUD_PROVIDER:-""}
 CLOUD_CONFIG=${CLOUD_CONFIG:-""}
 FEATURE_GATES=${FEATURE_GATES:-"AllAlpha=false"}
@@ -218,12 +218,12 @@ API_SECURE_PORT=${API_SECURE_PORT:-6443}

 # WARNING: For DNS to work on most setups you should export API_HOST as the docker0 ip address,
 API_HOST=${API_HOST:-localhost}
-API_HOST_IP=${API_HOST_IP:-"127.0.0.1"}
+API_HOST_IP=${API_HOST_IP:-"192.168.1.1"}
 ADVERTISE_ADDRESS=${ADVERTISE_ADDRESS:-""}
 API_BIND_ADDR=${API_BIND_ADDR:-"0.0.0.0"}
 EXTERNAL_HOSTNAME=${EXTERNAL_HOSTNAME:-localhost}

-KUBELET_HOST=${KUBELET_HOST:-"127.0.0.1"}
+KUBELET_HOST=${KUBELET_HOST:-"192.168.1.1"}
 # By default only allow CORS for requests on localhost
 API_CORS_ALLOWED_ORIGINS=${API_CORS_ALLOWED_ORIGINS:-/127.0.0.1(:[0-9]+)?$,/localhost(:[0-9]+)?$}
 KUBELET_PORT=${KUBELET_PORT:-10250}
@@ -753,7 +753,7 @@ function start_kubelet {
         -i \
         --cidfile=$KUBELET_CIDFILE \
         gcr.io/google_containers/kubelet \
-        /kubelet --v=${LOG_LEVEL} --containerized ${priv_arg}--chaos-chance="${CHAOS_CHANCE}" --pod-manifest-path="${POD_MANIFEST_PATH}" --hostname-override="${HOSTNAME_OVERRIDE}" --cloud-provider="${CLOUD_PROVIDER}" --cloud-config="${CLOUD_CONFIG}" \ --address="127.0.0.1" --kubeconfig "$CERT_DIR"/kubelet.kubeconfig --port="$KUBELET_PORT"  --enable-controller-attach-detach="${ENABLE_CONTROLLER_ATTACH_DETACH}" &> $KUBELET_LOG &
+        /kubelet --v=${LOG_LEVEL} --containerized ${priv_arg}--chaos-chance="${CHAOS_CHANCE}" --pod-manifest-path="${POD_MANIFEST_PATH}" --hostname-override="${HOSTNAME_OVERRIDE}" --cloud-provider="${CLOUD_PROVIDER}" --cloud-config="${CLOUD_CONFIG}" \ --address="192.168.1.1" --kubeconfig "$CERT_DIR"/kubelet.kubeconfig --port="$KUBELET_PORT"  --enable-controller-attach-detach="${ENABLE_CONTROLLER_ATTACH_DETACH}" &> $KUBELET_LOG &
     fi
 }

1: start k8s

root@Kubernete-Dev:/home/vincent/kubernetes# KUBEPROXY_MODE=ipvs hack/local-up-cluster.sh -O
WARNING : The kubelet is configured to not fail if swap is enabled; production deployments should disable swap.
skipped the build.
WARNING: No swap limit support
Kubelet cgroup driver defaulted to use: cgroupfs
API SERVER insecure port is free, proceeding...
API SERVER secure port is free, proceeding...
Detected host and ready to start services.  Doing some housekeeping first...
Using GO_OUT /home/vincent/kubernetes/_output/bin
Starting services now!
Starting etcd
etcd --advertise-client-urls http://127.0.0.1:2379 --data-dir /tmp/tmp.K1tMwh88m3 --listen-client-urls http://127.0.0.1:2379 --debug > "/dev/null" 2>/dev/null
Waiting for etcd to come up.
+++ [0110 10:54:27] On try 2, etcd: : http://127.0.0.1:2379
{"action":"set","node":{"key":"/_test","value":"","modifiedIndex":4,"createdIndex":4}}
Generating a 2048 bit RSA private key
.................................................................................................................+++
.....................+++
writing new private key to '/var/run/kubernetes/server-ca.key'
-----
Generating a 2048 bit RSA private key
...........+++
Local Kubernetes cluster is running. Press Ctrl-C to shut it down.

Logs:
  /tmp/kube-apiserver.log
  /tmp/kube-controller-manager.log
  /tmp/kube-proxy.log
  /tmp/kube-scheduler.log
  /tmp/kubelet.log

To start using your cluster, you can open up another terminal/tab and run:

  export KUBECONFIG=/var/run/kubernetes/admin.kubeconfig
  cluster/kubectl.sh

Alternatively, you can write to the default kubeconfig:

  export KUBERNETES_PROVIDER=local

  cluster/kubectl.sh config set-cluster local --server=https://localhost:6443 --
  cluster/kubectl.sh config set-credentials myself --client-key=/var/run/kuberne
  cluster/kubectl.sh config set-context local --cluster=local --user=myself
  cluster/kubectl.sh config use-context local
  cluster/kubectl.sh

2: create bigip admin login secret in k8s
vincent@Kubernete-Dev:~/kubernetes$ export KUBECONFIG=/var/run/kubernetes/admin.kubeconfig
vincent@Kubernete-Dev:~/kubernetes$ cluster/kubectl.sh create secret generic bigip-login --namespace kube-system --from-literal=username=admin --from-literal=password=admin
secret "bigip-login" created

3: deploy the F5 k8s-bigip-ctlr

vincent@Kubernete-Dev:~/kubernetes$ cluster/kubectl.sh apply -f f5-k8s-bigip-ctlr-deployment.yaml
deployment "test-bigip-controller-1" created
serviceaccount "bigip-ctlr-serviceaccount" created
clusterrole "bigip-ctlr-clusterrole" created
clusterrolebinding "bigip-ctlr-clusterrole-binding" created

4: show the f5-k8s-bigip-ctlr pod
vincent@Kubernete-Dev:~/kubernetes$ cluster/kubectl.sh get po --namespace=kube-system
NAME                                       READY     STATUS    RESTARTS   AGE
kube-dns-6c857864fb-lcxgf                  3/3       Running   0          2m
test-bigip-controller-1-67864586dd-jxq8p   1/1       Running   0          40s

5: show f5-k8s-bigip-ctlr pod logs

vincent@Kubernete-Dev:~/kubernetes$ cluster/kubectl.sh logs test-bigip-controller-1-67864586dd-jxq8p --namespace=kube-system | tail -10
2018/01/11 22:51:03 [INFO] [2018-01-11 22:51:03,010 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/11 22:51:05 [INFO] [2018-01-11 22:51:05,092 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/11 22:51:07 [INFO] [2018-01-11 22:51:07,109 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/11 22:51:09 [INFO] [2018-01-11 22:51:09,109 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/11 22:51:11 [INFO] [2018-01-11 22:51:11,127 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/11 22:51:13 [INFO] [2018-01-11 22:51:13,117 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/11 22:51:15 [INFO] [2018-01-11 22:51:15,209 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/11 22:51:17 [INFO] [2018-01-11 22:51:17,210 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/11 22:51:19 [INFO] [2018-01-11 22:51:19,243 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/11 22:51:21 [INFO] [2018-01-11 22:51:21,408 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule

6: the f5-k8s-bigip-ctlr container runs two processes:
 /app/bin/k8s-bigip-ctlr, written in Go, pulls configs from k8s
 /app/python/bigipconfigdriver.py, written in Python, pushes the config to BIGIP (could it be replaced with https://github.com/e-XpertSolutions/f5-rest-client ?)
vincent@Kubernete-Dev:~/kubernetes$ cluster/kubectl.sh exec -it  test-bigip-controller-1-67864586dd-jxq8p --namespace=kube-system -- /bin/sh
/app $ ps -ef
PID   USER     TIME   COMMAND
    1 ctlr       0:01 /app/bin/k8s-bigip-ctlr --bigip-partition test --bigip-url 192.168.1.2 --bigip-username admin --bigip-password admin --verify-interval 2 --namespace
   14 ctlr       0:10 python /app/python/bigipconfigdriver.py --config-file /tmp/k8s-bigip-ctlr.config450095162/config.json

/app $ ip addr show
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN qlen 1
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
40: eth0@if41: <BROADCAST,MULTICAST,UP,LOWER_UP,M-DOWN> mtu 1500 qdisc noqueue state UP
    link/ether 02:42:ac:11:00:03 brd ff:ff:ff:ff:ff:ff
    inet 172.17.0.3/16 scope global eth0
       valid_lft forever preferred_lft forever

f5-k8s-bigip-ctlr-deployment.yaml

#
# https://github.com/kubernetes/community/blob/master/contributors/devel/running-locally.md
# change all 127.0.0.1 in kubernetes/hack/local-up-cluster.sh to the private IP 192.168.1.1 that the external BIGIP can reach

# http://clouddocs.f5.com/containers/v2/kubernetes/kctlr-app-install.html
# step 1:
#create BIGIP login secret first
#kubectl create secret generic bigip-login --namespace kube-system --from-literal=username=admin --from-literal=password=admin

# step 2:
# cluster/kubectl.sh apply -f following yaml

apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  annotations:
    deployment.kubernetes.io/revision: "1"
  labels:
    app: test-bigip-controller-1
  name: test-bigip-controller-1
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: test-bigip-controller-1
  strategy:
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: test-bigip-controller-1
    spec:
      serviceAccountName: bigip-ctlr-serviceaccount
      containers:
      - args:
        - --bigip-partition
        - test
        - --bigip-url
        - 192.168.1.2
        - --bigip-username
        - admin
        - --bigip-password
        - admin
        - --verify-interval
        - "2"
        - --namespace
        - default
        - --node-poll-interval
        - "1"
        - --pool-member-type
        - nodeport
        - --log-level
        - INFO
        command:
        - /app/bin/k8s-bigip-ctlr
        image: f5networks/k8s-bigip-ctlr:latest
        imagePullPolicy: Always
        name: test-bigip-controller-1
        resources:
          limits:
            cpu: 100m
            memory: 128M
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30

---

apiVersion: v1
kind: ServiceAccount
metadata:
  name: bigip-ctlr-serviceaccount
  namespace: kube-system

---

# for use in k8s clusters using RBAC
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
  name: bigip-ctlr-clusterrole
rules:
- apiGroups:
  - ""
  - "extensions"
  resources:
  - nodes
  - services
  - endpoints
  - namespaces
  - ingresses
  - secrets
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  - "extensions"
  resources:
  - configmaps
  - events
  - ingresses/status
  verbs:
  - get
  - list
  - watch
  - update
  - create
  - patch

---

kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
  name: bigip-ctlr-clusterrole-binding
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: bigip-ctlr-clusterrole
subjects:
- kind: ServiceAccount
  name: bigip-ctlr-serviceaccount
  namespace: kube-system




ISSUE

 vincent@Kubernete-Dev:~/kubernetes$  cluster/kubectl.sh logs test-bigip-controller-1-7878d8b849-jdtr5 --namespace=kube-system
2018/01/04 20:56:17 [ERROR] [2018-01-04 20:56:17,650 __main__ ERROR] HTTPSConnectionPool(host='172.16.2.51', port=443): Max retries exceeded with url: /mgmt/shared/authn/login (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7f273380e650>: Failed to establish a new connection: [Errno 110] Operation timed out',))
2018/01/04 20:56:17 [CRITICAL] Config driver exited: 1

https://github.com/F5Networks/k8s-bigip-ctlr/issues/438 Investigate and document behavior when controller cannot communicate with BIG-IP
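The timeout above came from the config driver trying to reach a BIG-IP management address the pod's node could not route to. A rough pre-flight check from the k8s node against the management address used in this setup (192.168.1.2; self-signed cert, hence -k; /mgmt/tm/sys/version is a standard iControl REST endpoint) could be:

# confirm the BIG-IP REST API is reachable before deploying the controller
curl -sk -u admin:admin https://192.168.1.2/mgmt/tm/sys/version | head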




Kubernetes Multi-Node hack with BIGIP k8s controller


This is a hack around the all-in-one Kubernetes setup above and the BIGIP k8s-bigip-ctlr to run Kubernetes with multiple worker nodes.
The idea:
1, replace the Docker bridge network with a flannel vxlan network so worker nodes/pods can communicate with each other in the multi-node cluster,
2, etcd listens on 127.0.0.1 by default in all-in-one mode; make it listen on 0.0.0.0, since flannel on each worker node needs to talk to etcd.
From kubernetes/hack/lib/etcd.sh, we can set the ETCD_HOST environment variable, thus:
[root@fed-master kubernetes]# ETCD_HOST=0.0.0.0 hack/local-up-cluster.sh -O
Leave docker and flanneld running on all worker nodes. Each time Kubernetes is started, the flannel network config has to be re-added
and each worker node re-registered, because the etcd instance started by hack/local-up-cluster.sh cleans up everything when it quits.
The docker0 and flannel links may look like:
Master node:
33: flannel.1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1350 qdisc noqueue state UNKNOWN group default
    link/ether 6a:ce:e5:ac:a1:f1 brd ff:ff:ff:ff:ff:ff
    inet 18.16.95.0/32 scope global flannel.1
       valid_lft forever preferred_lft forever
    inet6 fe80::68ce:e5ff:feac:a1f1/64 scope link
       valid_lft forever preferred_lft forever
34: docker0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1350 qdisc noqueue state UP group default
    link/ether 02:42:4c:a1:e7:50 brd ff:ff:ff:ff:ff:ff
    inet 18.16.95.1/24 scope global docker0
       valid_lft forever preferred_lft forever
    inet6 fe80::42:4cff:fea1:e750/64 scope link
       valid_lft forever preferred_lft forever
Worker node:
6: flannel.1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1350 qdisc noqueue state UNKNOWN group default
    link/ether a6:f5:5d:cc:94:3e brd ff:ff:ff:ff:ff:ff
    inet 18.16.100.0/32 scope global flannel.1
       valid_lft forever preferred_lft forever
    inet6 fe80::a4f5:5dff:fecc:943e/64 scope link
       valid_lft forever preferred_lft forever
7: docker0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1350 qdisc noqueue state UP group default
    link/ether 02:42:d9:0c:05:a9 brd ff:ff:ff:ff:ff:ff
    inet 18.16.100.1/24 scope global docker0
       valid_lft forever preferred_lft forever
    inet6 fe80::42:d9ff:fe0c:5a9/64 scope link
       valid_lft forever preferred_lft forever

So the running order is:

1 #ETCD_HOST=0.0.0.0 hack/local-up-cluster.sh -O

2, create the flannel network configuration (note: etcdctl here is the one installed by hack/install-etcd.sh under kubernetes/third_party/etcd)

[root@fed-master kubernetes]# cat flannel-config.json
{
    "Network": "18.16.0.0/16",
    "SubnetLen": 24,
    "Backend": {
        "Type": "vxlan",
        "VNI": 1
     }
}
[root@fed-master kubernetes]# etcdctl set /coreos.com/network/config < flannel-config.json
{
    "Network": "18.16.0.0/16",
    "SubnetLen": 24,
    "Backend": {
        "Type": "vxlan",
        "VNI": 1
     }
}
[root@fed-master kubernetes]# etcdctl get /coreos.com/network/config
{
    "Network": "18.16.0.0/16",
    "SubnetLen": 24,
    "Backend": {
        "Type": "vxlan",
        "VNI": 1
     }
}
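With the network config in etcd, flanneld on each worker node must point at the master's etcd before docker starts. On Fedora that is roughly the following (a sketch only; the exact file location and variable name depend on the flannel package version):

# /etc/sysconfig/flanneld on the worker node -- point flannel at the master's etcd
FLANNEL_ETCD_ENDPOINTS="http://192.168.1.3:2379"

# then restart flanneld and docker so docker0 picks up the flannel subnet
systemctl restart flanneld
systemctl restart docker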

3, add the worker node (note: add fed-node2 to /etc/hosts first), then register it as shown after the JSON below


[root@fed-master kubernetes]# cat fed-node2.json
 {
     "apiVersion": "v1",
     "kind": "Node",
     "metadata": {
         "name": "fed-node2",
         "labels":{ "name": "fed-node2-label"}
     },
     "spec": {
         "externalID": "fed-node2"
     }
 }
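The node object is registered from the master with kubectl (a sketch; the kubelet on fed-node2 must also be running and pointed at the master API on 192.168.1.3):

[root@fed-master kubernetes]# cluster/kubectl.sh create -f fed-node2.json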
[root@fed-master kubernetes]# cluster/kubectl.sh get no -o wide
NAME          STATUS    ROLES     AGE       VERSION                     EXTERNAL-IP   OS-IMAGE                 KERNEL-VERSION           CONTAINER-RUNTIME
192.168.1.3   Ready     <none>    4m        v0.0.0-master+$Format:%h$   <none>        Fedora 26 (Twenty Six)   4.11.8-300.fc26.x86_64   docker://1.13.1
fed-node2     Ready     <none>    14s       v1.7.3                      <none>        Fedora 26 (Twenty Six)   4.11.8-300.fc26.x86_64   docker://1.13.1

 

4, the rest of the k8s-bigip-ctlr setup is the same

[root@fed-master kubernetes]# cluster/kubectl.sh create secret generic bigip-login --namespace kube-system --from-literal=username=admin --from-literal=password=admin
secret "bigip-login" created
[root@fed-master kubernetes]# cluster/kubectl.sh apply -f f5-k8s-bigip-ctlr-deployment.yaml
deployment "test-bigip-controller-1" created
serviceaccount "bigip-ctlr-serviceaccount" created
clusterrole "bigip-ctlr-clusterrole" created
clusterrolebinding "bigip-ctlr-clusterrole-binding" created
[root@fed-master kubernetes]# cluster/kubectl.sh get po --namespace=kube-system -o wide
NAME                                       READY     STATUS    RESTARTS   AGE       IP            NODE
kube-dns-774d5484cc-942n2                  3/3       Running   0          10m       18.16.95.2    192.168.1.3
test-bigip-controller-1-67864586dd-p6djt   1/1       Running   0          53s       18.16.100.2   fed-node2  <=== as you can see, the k8s-bigip-ctlr pod is scheduled on worker node fed-node2
[root@fed-master kubernetes]# cluster/kubectl.sh log test-bigip-controller-1-67864586dd-p6djt --namespace=kube-system | tail -5
W0117 15:12:03.529103   11055 cmd.go:354] log is DEPRECATED and will be removed in a future version. Use logs instead.
2018/01/17 23:11:53 [INFO] [2018-01-17 23:11:53,625 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/17 23:11:55 [INFO] [2018-01-17 23:11:55,811 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/17 23:11:57 [INFO] [2018-01-17 23:11:57,927 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/17 23:11:59 [INFO] [2018-01-17 23:11:59,849 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/17 23:12:01 [INFO] [2018-01-17 23:12:01,940 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule

ISSUE
Using flannel this way is kind of painful. When restarting k8s/docker/flanneld, k8s can't start because it depends on docker; docker can't start because it depends on flanneld once flanneld is configured to provide docker's network; and flanneld can't start because it depends on etcd, which is part of the k8s stack started by hack/local-up-cluster.sh. It's a circular dependency.
Solution 1:
  Don't use flannel; just use the Docker bridge network on all worker nodes. The Docker default network is 172.17.0.1/16, so every worker node's docker0 would end up with the same 172.17.0.1/16 subnet:
6: docker0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default
    link/ether 02:42:80:bd:79:37 brd ff:ff:ff:ff:ff:ff
    inet 172.17.0.1/16 scope global docker0
       valid_lft forever preferred_lft forever
    inet6 fe80::42:80ff:febd:7937/64 scope link
       valid_lft forever preferred_lft forever

I can work around the overlapping subnets by configuring a custom Docker network on each worker node:

[root@fed-node2 ~]# cat /etc/sysconfig/docker-network
# /etc/sysconfig/docker-network
DOCKER_NETWORK_OPTIONS="--bip=172.17.2.1/24 --fixed-cidr=172.17.2.0/24"
4: docker0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default
    link/ether 02:42:76:e2:9e:60 brd ff:ff:ff:ff:ff:ff
    inet 172.17.2.1/24 scope global docker0
       valid_lft forever preferred_lft forever
    inet6 fe80::42:76ff:fee2:9e60/64 scope link
       valid_lft forever preferred_lft forever

[root@fed-master kubernetes]# cluster/kubectl.sh get po --namespace=kube-system -o wide
NAME                                       READY     STATUS    RESTARTS   AGE       IP           NODE
kube-dns-774d5484cc-f9zkw                  3/3       Running   0          1h        172.17.0.2   192.168.1.3
test-bigip-controller-1-67864586dd-lvrdq   1/1       Running   0          14m       172.17.2.2   fed-node2
 This seems like the better quick-setup solution.

Solution 2 (this is the preferred solution):
I found I can start up k8s without the docker dependency: make a copy of hack/local-up-cluster.sh as hack/multi-node-cluster.sh
and change hack/multi-node-cluster.sh so it does not depend on docker. Now reboot the k8s master/worker nodes; once k8s has started, re-create the flannel vxlan network,
and flanneld will start fine and docker can be restarted.
#KUBEPROXY_MODE=ipvs ETCD_HOST=0.0.0.0 hack/multi-node-cluster.sh -O

[root@fed-master kubernetes]# diff -u hack/local-up-cluster.sh hack/multi-node-cluster.sh
--- hack/local-up-cluster.sh    2018-01-18 10:48:55.050801098 -0800
+++ hack/multi-node-cluster.sh  2018-01-19 10:43:54.163855014 -0800
@@ -15,6 +15,7 @@
 # limitations under the License.

 KUBE_ROOT=$(dirname "${BASH_SOURCE}")/..
+MASTER_IP="192.168.1.3"

 # This command builds and runs a local kubernetes cluster.
 # You may need to run this as root to allow kubelet to open docker's socket,
@@ -64,7 +65,7 @@
 KUBECTL=${KUBECTL:-cluster/kubectl.sh}
 WAIT_FOR_URL_API_SERVER=${WAIT_FOR_URL_API_SERVER:-20}
 ENABLE_DAEMON=${ENABLE_DAEMON:-false}
-HOSTNAME_OVERRIDE=${HOSTNAME_OVERRIDE:-"127.0.0.1"}
+HOSTNAME_OVERRIDE=${HOSTNAME_OVERRIDE:-"${MASTER_IP}"}
 EXTERNAL_CLOUD_PROVIDER=${EXTERNAL_CLOUD_PROVIDER:-false}
 CLOUD_PROVIDER=${CLOUD_PROVIDER:-""}
 CLOUD_CONFIG=${CLOUD_CONFIG:-""}
@@ -219,12 +220,12 @@

 # WARNING: For DNS to work on most setups you should export API_HOST as the docker0 ip address,
 API_HOST=${API_HOST:-localhost}
-API_HOST_IP=${API_HOST_IP:-"127.0.0.1"}
+API_HOST_IP=${API_HOST_IP:-"${MASTER_IP}"}
 ADVERTISE_ADDRESS=${ADVERTISE_ADDRESS:-""}
 API_BIND_ADDR=${API_BIND_ADDR:-"0.0.0.0"}
 EXTERNAL_HOSTNAME=${EXTERNAL_HOSTNAME:-localhost}

-KUBELET_HOST=${KUBELET_HOST:-"127.0.0.1"}
+KUBELET_HOST=${KUBELET_HOST:-"${MASTER_IP}"}
 # By default only allow CORS for requests on localhost
 API_CORS_ALLOWED_ORIGINS=${API_CORS_ALLOWED_ORIGINS:-/127.0.0.1(:[0-9]+)?$,/localhost(:[0-9]+)?$}
 KUBELET_PORT=${KUBELET_PORT:-10250}
@@ -797,7 +798,7 @@
         -i \
         --cidfile=$KUBELET_CIDFILE \
         gcr.io/google_containers/kubelet \
-        /kubelet --v=${LOG_LEVEL} --containerized ${priv_arg}--chaos-chance="${CHAOS_CHANCE}" --pod-manifest-path="${POD_MANIFEST_PATH}" --hostname-override="${HOSTNAME_OVERRIDE}" ${cloud_config_arg} \ --address="127.0.0.1" --kubeconfig "$CERT_DIR"/kubelet.kubeconfig --port="$KUBELET_PORT"  --enable-controller-attach-detach="${ENABLE_CONTROLLER_ATTACH_DETACH}" &> $KUBELET_LOG &
+        /kubelet --v=${LOG_LEVEL} --containerized ${priv_arg}--chaos-chance="${CHAOS_CHANCE}" --pod-manifest-path="${POD_MANIFEST_PATH}" --hostname-override="${HOSTNAME_OVERRIDE}" ${cloud_config_arg} \ --address="${MASTER_IP}" --kubeconfig "$CERT_DIR"/kubelet.kubeconfig --port="$KUBELET_PORT"  --enable-controller-attach-detach="${ENABLE_CONTROLLER_ATTACH_DETACH}" &> $KUBELET_LOG &
     fi
 }

@@ -949,9 +950,9 @@
   kube::etcd::validate
 fi

-if [ "${CONTAINER_RUNTIME}" == "docker" ] && ! kube::util::ensure_docker_daemon_connectivity; then
-  exit 1
-fi
+#if [ "${CONTAINER_RUNTIME}" == "docker" ] && ! kube::util::ensure_docker_daemon_connectivity; then
+#  exit 1
+#fi

 if [[ "${CONTAINER_RUNTIME}" == "rkt" ]]; then
   test_rkt

If kube-proxy runs in IPVS mode, only the master node actually has IPVS set up; the worker node kube-proxy still uses iptables, so there will be connectivity issues reaching pods running on the worker node. This can be resolved by running flannel in host-gateway mode, since host-gw mode installs routes so that pods on a worker node are reached via the node IP as gateway.
[root@fed-master kubernetes]# cat flannel-config-hostgw.json
{
"Network": "172.16.0.0/16",
"SubnetLen": 24,
"Backend":
  {
    "Type": "host-gw"
  }
}

  #etcdctl rm /coreos.com/network/config
#etcdctl set /coreos.com/network/config < flannel-config-hostgw.json
Master node
[root@fed-master kubernetes]# ip route show
default via 192.168.48.254 dev ens3 proto static metric 100
172.16.56.0/24 via 192.168.1.4 dev ens7
172.16.84.0/24 dev docker0 proto kernel scope link src 172.16.84.1
192.168.1.0/24 dev ens7 proto kernel scope link src 192.168.1.3 metric 100
[root@fed-master kubernetes]# ip addr show dev docker0
23: docker0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1400 qdisc noqueue state UP group default
    link/ether 02:42:9e:2f:92:38 brd ff:ff:ff:ff:ff:ff
    inet 172.16.84.1/24 scope global docker0
       valid_lft forever preferred_lft forever
    inet6 fe80::42:9eff:fe2f:9238/64 scope link
       valid_lft forever preferred_lft forever
[root@fed-master kubernetes]# ip addr show dev ens7
3: ens7: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1400 qdisc fq_codel state UP group default qlen 1000
    link/ether 52:54:00:66:78:1c brd ff:ff:ff:ff:ff:ff
    inet 192.168.1.3/24 brd 192.168.1.255 scope global ens7
       valid_lft forever preferred_lft forever
    inet6 fe80::2a6a:c357:14a7:55/64 scope link
       valid_lft forever preferred_lft forever
 [root@fed-master kubernetes]# cluster/kubectl.sh get no -o wide
NAME          STATUS    ROLES     AGE       VERSION                     EXTERNAL-IP   OS-IMAGE                 KERNEL-VERSION           CONTAINER-RUNTIME
192.168.1.3   Ready     <none>    36m       v0.0.0-master+$Format:%h$   <none>        Fedora 26 (Twenty Six)   4.11.8-300.fc26.x86_64   docker://1.13.1
fed-node2     Ready     <none>    28m       v1.7.3                      <none>        Fedora 26 (Twenty Six)   4.11.8-300.fc26.x86_64   docker://1.13.1
[root@fed-master kubernetes]# cluster/kubectl.sh get po -o wide
NAME          READY     STATUS    RESTARTS   AGE       IP            NODE
nginx-5vr9t   1/1       Running   0          25m       172.16.56.2   fed-node2
nginx-n8cxl   1/1       Running   0          25m       172.16.84.3   192.168.1.3
[root@fed-master kubernetes]# cluster/kubectl.sh get svc -o wide
NAME           TYPE        CLUSTER-IP   EXTERNAL-IP   PORT(S)        AGE       SELECTOR
kubernetes     ClusterIP   10.0.0.1     <none>        443/TCP        38m       <none>
nginxservice   NodePort    10.0.0.24    <none>        80:32506/TCP   27m       app=nginx
[root@fed-master kubernetes]# ipvsadm -ln
IP Virtual Server version 1.2.1 (size=4096)
Prot LocalAddress:Port Scheduler Flags
  -> RemoteAddress:Port           Forward Weight ActiveConn InActConn
TCP  127.0.0.1:32506 rr
  -> 172.16.56.2:80               Masq    1      0          0
  -> 172.16.84.3:80               Masq    1      0          0
TCP  172.16.84.1:32506 rr
  -> 172.16.56.2:80               Masq    1      0          0
  -> 172.16.84.3:80               Masq    1      0          0
TCP  192.168.1.3:32506 rr
  -> 172.16.56.2:80               Masq    1      0          0
  -> 172.16.84.3:80               Masq    1      0          0
TCP  192.168.1.5:32506 rr
  -> 172.16.56.2:80               Masq    1      0          0
  -> 172.16.84.3:80               Masq    1      0          0
TCP  10.0.0.24:80 rr
  -> 172.16.56.2:80               Masq    1      0          0
  -> 172.16.84.3:80               Masq    1      0          0

 Worker node
[root@fed-node2 ~]# ip route show
default via 192.168.48.254 dev ens3 proto static metric 100
172.16.56.0/24 dev docker0 proto kernel scope link src 172.16.56.1
172.16.84.0/24 via 192.168.1.3 dev ens7
[root@fed-node2 ~]# ip addr show dev docker0
6: docker0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1400 qdisc noqueue state UP group default
    link/ether 02:42:6e:f5:f9:0f brd ff:ff:ff:ff:ff:ff
    inet 172.16.56.1/24 scope global docker0
       valid_lft forever preferred_lft forever
    inet6 fe80::42:6eff:fef5:f90f/64 scope link
       valid_lft forever preferred_lft forever

[root@fed-node2 ~]# ip addr show dev ens7
3: ens7: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1400 qdisc fq_codel state UP group default qlen 1000
    link/ether 52:54:00:07:02:8f brd ff:ff:ff:ff:ff:ff
    inet 192.168.1.4/24 brd 192.168.1.255 scope global ens7
       valid_lft forever preferred_lft forever
    inet6 fe80::ba37:e2c5:ccf2:1616/64 scope link
       valid_lft forever preferred_lft forever
    inet6 fe80::2a6a:c357:14a7:55/64 scope link tentative dadfailed
       valid_lft forever preferred_lft forever

Manage BIG-IP virtual servers - Kubernetes

http://clouddocs.f5.com/containers/v2/kubernetes/kctlr-manage-bigip-objects.html
Download http://clouddocs.f5.com/containers/v2/_downloads/f5-resource-vs-example.configmap.yaml and modify it accordingly:

1, create the k8s nginx pods and the nginx NodePort service:

[root@fed-master kubernetes]# cat nginx_pod.yaml
apiVersion: v1
kind: ReplicationController
metadata:
  name: nginx
spec:
  replicas: 2
  selector:
    app: nginx
  template:
    metadata:
      name: nginx
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: nginx
        ports:
        - containerPort: 80

[root@fed-master kubernetes]# cluster/kubectl.sh get po -o wide
NAME          READY     STATUS    RESTARTS   AGE       IP            NODE
nginx-27rlx   1/1       Running   0          21h       172.16.14.2   fed-node2
nginx-sl7xw   1/1       Running   0          21h       172.16.79.3   192.168.1.3

[root@fed-master kubernetes]# cat nginx_service.yaml
apiVersion: v1
kind: Service
metadata:
  labels:
    name: nginxservice
  name: nginxservice
spec:
  ports:
    # The port that this service should serve on.
    - port: 80
      nodePort: 32506
  selector:
    app: nginx
  type: NodePort
#  externalIPs:
#    - 192.168.121.66

[root@fed-master kubernetes]# cluster/kubectl.sh get svc -o wide
NAME           TYPE        CLUSTER-IP   EXTERNAL-IP   PORT(S)        AGE       SELECTOR
kubernetes     ClusterIP   10.0.0.1     <none>        443/TCP        1d        <none>
nginxservice   NodePort    10.0.0.140   <none>        80:32506/TCP   21h       app=nginx

The NodePort listener created by kube-proxy in IPVS mode:
[root@fed-master kubernetes]# ipvsadm -ln
IP Virtual Server version 1.2.1 (size=4096)
Prot LocalAddress:Port Scheduler Flags
  -> RemoteAddress:Port           Forward Weight ActiveConn InActConn

TCP  192.168.1.3:32506 rr
  -> 172.16.14.2:80               Masq    1      0          4
  -> 172.16.79.3:80               Masq    1      0          4
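A quick way to confirm the NodePort actually answers before wiring it into BIG-IP is a plain curl against each node IP (sketch):

# hit the NodePort on both k8s nodes directly
curl -s http://192.168.1.3:32506 | head -4
curl -s http://192.168.1.4:32506 | head -4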

2, modify f5-resource-vs-example.configmap.yaml based on the k8s nginxservice:


kind: ConfigMap
apiVersion: v1
metadata:
  # name of the resource to create on the BIG-IP
  name: http.vs
  # The namespace to create the object in.
  # The k8s-bigip-ctlr watches all namespaces by default (as of v1.1).
  # If the k8s-bigip-ctlr is watching a specific namespace(s),
  # this setting must match the namespace of the Service you want to proxy
  # -AND- the namespace(s) the k8s-bigip-ctlr watches.
  namespace: default
  labels:
    # tells the k8s-bigip-ctlr to watch this ConfigMap
    f5type: virtual-server
data:
  # NOTE: schema v0.1.4 is required as of k8s-bigip-ctlr v1.3.0
  schema: "f5schemadb://bigip-virtual-server_v0.1.4.json"
  data: |
    {
      "virtualServer": {
        "backend": {
          "servicePort": 80,
          "serviceName": "nginxservice",
          "healthMonitors": [{
            "interval": 30,
            "protocol": "http",
            "send": "GET /\r\n",
            "timeout": 120
          }]
        },
        "frontend": {
          "virtualAddress": {
            "port": 80,
            "bindAddr": "192.168.1.9"
          },
          "partition": "test",
          "balance": "round-robin",
          "mode": "http"
        }
      }
    }
---

kind: ConfigMap
apiVersion: v1
metadata:
  name: https.vs
  labels:
    f5type: virtual-server
data:
  schema: "f5schemadb://bigip-virtual-server_v0.1.4.json"
  data: |
    {
      "virtualServer": {
        "backend": {
          "servicePort": 80,
          "serviceName": "nginxservice",
          "healthMonitors": [{
            "interval": 30,
            "protocol": "http",
            "send": "GET /\r\n",
            "timeout": 120
          }]
        },
        "frontend": {
          "virtualAddress": {
            "port": 443,
            "bindAddr": "192.168.1.9"
          },
          "partition": "test",
          "balance": "round-robin",
          "mode": "http",
          "sslProfile": {
          "f5ProfileName": "Common/clientssl"
          }
        }
      }
    }

#cluster/kubectl.sh create -f f5-resource-vs-example.configmap.yaml
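To confirm the controller sees the two ConfigMaps, they can be listed by the f5type label it watches (sketch):

#cluster/kubectl.sh get configmap -l f5type=virtual-server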

3, verify the k8s-bigip-ctlr pod logs

#cluster/kubectl.sh logs test-bigip-controller-1-69cb56f6d7-m4bk2  --namespace=kube-system
2018/01/24 18:43:14 [INFO] ConfigWriter started: 0xc420112210
2018/01/24 18:43:14 [INFO] Started config driver sub-process at pid: 17
2018/01/24 18:43:14 [INFO] NodePoller (0xc42007a7e0) registering new listener: 0x406550
2018/01/24 18:43:15 [INFO] NodePoller started: (0xc42007a7e0)
2018/01/24 18:43:15 [WARNING] Overwriting existing entry for backend {ServiceName:nginxservice ServicePort:80 Namespace:default}
2018/01/24 18:43:16 [WARNING] Overwriting existing entry for backend {ServiceName:nginxservice ServicePort:80 Namespace:default}
2018/01/24 18:43:16 [INFO] Wrote 2 Virtual Server configs
2018/01/24 18:43:19 [INFO] [2018-01-24 18:43:19,273 f5_cccl.bigip INFO] BigIPProxy managed types: /tm/ltm/virtual,/tm/ltm/pool,/tm/ltm/monitor,/tm/sys/application/service
2018/01/24 18:43:21 [INFO] [2018-01-24 18:43:21,274 __main__ INFO] entering inotify loop to watch /tmp/k8s-bigip-ctlr.config738141005/config.json
2018/01/24 18:43:21 [INFO] [2018-01-24 18:43:21,588 f5_cccl.resource.resource INFO] Updating ApiIRule: /test/http_redirect_irule
2018/01/24 18:43:21 [INFO] [2018-01-24 18:43:21,705 f5_cccl.resource.resource INFO] Updating ApiVirtualServer: /test/default_https.vs
2018/01/24 18:43:21 [INFO] [2018-01-24 18:43:21,773 f5_cccl.resource.resource INFO] Updating ApiVirtualServer: /test/default_http.vs
2018/01/24 18:43:45 [WARNING] Overwriting existing entry for backend {ServiceName:nginxservice ServicePort:80 Namespace:default}
2018/01/24 18:43:45 [WARNING] Overwriting existing entry for backend {ServiceName:nginxservice ServicePort:80 Namespace:default}
2018/01/24 18:43:45 [INFO] Wrote 2 Virtual Server configs

4, verify the BIGIP LTM virtual server configuration

[root@bigip-kvm:Active:Standalone] test # tmsh
root@(bigip-kvm)(cfg-sync Standalone)(Active)(/Common)(tmos)# cd /test
root@(bigip-kvm)(cfg-sync Standalone)(Active)(/test)(tmos)# list ltm virtual
ltm virtual default_http.vs {
    destination 192.168.1.9:http
    ip-protocol tcp
    mask 255.255.255.255
    partition test
    pool default_http.vs
    profiles {
        /Common/http { }
        /Common/tcp { }
    }
    source 0.0.0.0/0
    source-address-translation {
        type automap
    }
    translate-address enabled
    translate-port enabled
    vs-index 5
}
ltm virtual default_https.vs {
    destination 192.168.1.9:https
    ip-protocol tcp
    mask 255.255.255.255
    partition test
    pool default_https.vs
    profiles {
        /Common/clientssl {
            context clientside
        }
        /Common/http { }
        /Common/tcp { }
    }
    source 0.0.0.0/0
    source-address-translation {
        type automap
    }
    translate-address enabled
    translate-port enabled
    vs-index 6
}
Note the ltm pools have the k8s nodes 192.168.1.3 (fed-master) and 192.168.1.4 (fed-node2), port 32506, as pool members
root@(bigip-kvm)(cfg-sync Standalone)(Active)(/test)(tmos)# list ltm pool
ltm pool default_http.vs {
    members {
        192.168.1.3:32506 {
            address 192.168.1.3
            session monitor-enabled
            state up
        }
        192.168.1.4:32506 {
            address 192.168.1.4
            session monitor-enabled
            state up
        }
    }
    monitor default_http.vs_0_http
    partition test
}
ltm pool default_https.vs {
    members {
        192.168.1.3:32506 {
            address 192.168.1.3
            session monitor-enabled
            state up
        }
        192.168.1.4:32506 {
            address 192.168.1.4
            session monitor-enabled
            state up
        }
    }
    monitor default_https.vs_0_http
    partition test
}
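tmsh can also show the live member and monitor status of those pools, which helps when a node port stops responding (a sketch using the pool name created above):

root@(bigip-kvm)(cfg-sync Standalone)(Active)(/test)(tmos)# show ltm pool default_http.vs members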

5, test traffic through the virtual servers

[root@fed-node2 ~]# curl -k https://192.168.1.9
<!DOCTYPE html>
<html>
<head>
<title>Welcome to nginx!</title>
<style>
    body {
        width: 35em;
        margin: 0 auto;
        font-family: Tahoma, Verdana, Arial, sans-serif;
    }
</style>
</head>
<body>
<h1>Welcome to nginx!</h1>
<p>If you see this page, the nginx web server is successfully installed and
working. Further configuration is required.</p>

<p>For online documentation and support please refer to
<a href="http://nginx.org/">nginx.org</a>.<br/>
Commercial support is available at
<a href="http://nginx.com/">nginx.com</a>.</p>

<p><em>Thank you for using nginx.</em></p>
</body>
</html>

[root@fed-node2 ~]# curl  http://192.168.1.9
<!DOCTYPE html>
<html>
<head>
<title>Welcome to nginx!</title>
<style>
    body {
        width: 35em;
        margin: 0 auto;
        font-family: Tahoma, Verdana, Arial, sans-serif;
    }
</style>
</head>
<body>
<h1>Welcome to nginx!</h1>
<p>If you see this page, the nginx web server is successfully installed and
working. Further configuration is required.</p>

<p>For online documentation and support please refer to
<a href="http://nginx.org/">nginx.org</a>.<br/>
Commercial support is available at
<a href="http://nginx.com/">nginx.com</a>.</p>

<p><em>Thank you for using nginx.</em></p>
</body>
</html>



Friday, October 13, 2017

Import existing source code to GitHub

https://stackoverflow.com/questions/4658606/import-existing-source-code-to-github

If you've got local source code you want to add to a new remote git repository without 'cloning' the remote first, do the following (I often do this - you create your remote empty repository in bitbucket/github, then push up your source):
  1. Create the remote repository, and get the URL such as git@github.com:/youruser/somename.git or https://github.com/youruser/somename.git
    If your local GIT repo is already set up, skip steps 2 and 3

  2. Locally, at the root directory of your source, git init
    2a. If you initialize the repo with a .gitignore and a README.md you should do a git pull {url from step 1} to ensure you don't commit files to source that you want to ignore ;)
  3. Locally, add and commit what you want in your initial repo (for everything, git add . then git commit -m 'initial commit comment')

  4. to attach your remote repo with the name 'origin' (like cloning would do)
    git remote add origin [URL From Step 1]
  5. Execute git pull origin master to pull the remote branch so that they are in sync.
  6. to push up your master branch (change master to something else for a different branch):
    git push origin master
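Putting the answer's steps together, the whole flow for an existing local source tree looks roughly like this (the repository URL is a placeholder for your own):

cd /path/to/your/source
git init
git add .
git commit -m 'initial commit'
git remote add origin git@github.com:youruser/somename.git
# only needed if the remote was created with a README/.gitignore
git pull origin master
git push origin master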

Friday, June 30, 2017

KVM vhost performance tuning to enhance ADC VE throughput

As global telecom companies start adopting ADCs (Application Delivery Controllers, i.e. load balancers) in their OpenStack environments, achieving high throughput for ADC VE instances becomes important. Unlike ADC hardware appliances, an ADC VE runs on commodity customer hardware with Red Hat/Ubuntu as the host OS and KVM as the hypervisor, so knowing the underlying technologies and how to tune the hypervisor environment matters for performance. Here is some hands-on experience tuning KVM vhost to achieve good throughput.

Lab equipment

Dell Poweredge R710 (16 cores) + Intel 82599 10G NIC + 72G RAM
Dell Poweredge R210 (8 cores) + Intel 82599 10G NIC + 32G RAM

Network setup:

 /external vlan|<------------->|eth1 <--->iperf client \
| Dell R710(ADC VE)           Dell R210               |
 \Internal vlan|<------------->|eth2 <--->iperf server /


Note: since I only have two physical servers, with the Dell R710 hosting the ADC VE, I have to use the Dell R210 as both iperf server and iperf client. I used Linux network namespaces to isolate the IP and routing spaces, so the iperf client packets egress the physical NIC eth1, get forwarded by the BIGIP VE, and come back in on the physical NIC eth2 to be processed by the iperf server. Here is a simple bash script to set up the Linux network namespaces:



#!/usr/bin/env bash

set -x

NS1="ns1"
NS2="ns2"
DEV1="em1"
DEV2="em2"
IP1="10.1.72.62"
IP2="10.2.72.62"
NET1="10.1.0.0/16"
NET2="10.2.0.0/16"
GW1="10.1.72.1"
GW2="10.2.72.1"

if [[ $EUID -ne 0 ]]; then
    echo "You must be root to run this script"
    exit 1
fi

# Remove namespace if it exists.
ip netns del $NS1 &>/dev/null
ip netns del $NS2 &>/dev/null

# Create namespace
ip netns add $NS1
ip netns add $NS2

#add physical interface to namespace
ip link set dev $DEV1  netns $NS1
ip link set dev $DEV2  netns $NS2




# Setup namespace IP .
ip netns exec $NS1 ip addr add $IP1/16 dev $DEV1
ip netns exec $NS1 ip link set $DEV1 up
ip netns exec $NS1 ip link set lo up
ip netns exec $NS1 ip route add $NET2 via $GW1 dev $DEV1

ip netns exec $NS2 ip addr add $IP2/16 dev $DEV2
ip netns exec $NS2 ip link set $DEV2 up
ip netns exec $NS2 ip link set lo up
ip netns exec $NS2 ip route add $NET1 via $GW2 dev $DEV2

# Enable IP-forwarding.
echo 1 > /proc/sys/net/ipv4/ip_forward

# Get into namespace
#ip netns exec ${NS} /bin/bash --rcfile <(echo "PS1=\"${NS}> \"")
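With the namespaces up, the iperf server and client are started inside ns2 and ns1 respectively (the iperf binary path is just where it happens to live on the R210; same command line as the run below):

# iperf server in ns2, client in ns1; traffic is forced through the BIGIP VE forwarding virtual
ip netns exec ns2 /home/dpdk/iperf -s &
ip netns exec ns1 /home/dpdk/iperf -c 10.2.72.62 -l 1024 -P 64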

On the ADC VE I set up a simple forwarding virtual server to just forward the packets. This is the default throughput without any performance tuning:

ns1> /home/dpdk/iperf -c 10.2.72.62 -l 1024 -P 64
...............
................
[ 25]  0.0-10.2 sec  46.0 MBytes  37.9 Mbits/sec
[SUM]  0.0-10.2 sec  3.22 GBytes  2.72 Gbits/sec <======= 2.72 Gbits/sec

Here is what the top output of the vhost dataplane kernel threads for the ADC VE looks like while passing traffic:

 PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND                                                                                                                 P
23329 libvirt+  20   0 35.366g 0.030t  23396 S 262.5 43.4 153:31.10 qemu-system-x86_64 -enable-kvm -name bigip-virtio -S -machine pc-i440fx-trusty,accel=kvm,usb=off -m 31357 -realtime m+  1
23332 root      20   0       0      0      0 R  17.9  0.0   1:35.98 [vhost-23329]                                                                                                           1
23336 root      20   0       0      0      0 R  17.9  0.0   1:18.20 [vhost-23329]


As you can see, only two vhost kernel threads show up, each at 17.9% CPU, which indicates vhost is not being scheduled enough to pass data traffic for the guest. I have defined 4 tx/rx queue pairs for each macvtap on the physical 10G interfaces, with two macvtaps assigned to the ADC VE for the external and internal VLANs; ideally there should be 8 vhost kernel threads showing up in top, each fully scheduled to pass traffic.

For example, the interface XML dump is as below:


    

<interface type='bridge'>
      <mac address='52:54:00:55:47:05'/>
      <source bridge='br0'/>
      <target dev='vnet1'/>
      <model type='virtio'/>
      <alias name='net0'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/>
    </interface>
    <interface type='direct'>
      <mac address='52:54:00:f9:98:e9'/>
      <source dev='enp4s0f0' mode='vepa'/>
      <target dev='macvtap2'/>
      <model type='virtio'/>
      <driver name='vhost' queues='4'/>
      <alias name='net1'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x04' function='0x0'/>
    </interface>
    <interface type='direct'>
      <mac address='52:54:00:4b:06:c4'/>
      <source dev='enp4s0f1' mode='vepa'/>
      <target dev='macvtap3'/>
      <model type='virtio'/>
      <driver name='vhost' queues='4'/>
      <alias name='net2'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x09' function='0x0'/>
    </interface>

vCPU pin assignment:

root@Dell710:~# virsh vcpupin bigip-virtio
VCPU: CPU Affinity
----------------------------------
   0: 0
   1: 2
   2: 4
   3: 6
   4: 8
   5: 10
   6: 12
   7: 14
   8: 2
   9: 4

 vhost cpu pin:

~#  virsh emulatorpin bigip-virtio
emulator: CPU Affinity
----------------------------------
       *: 0,2,4,6,8,10,12,14

NUMA node:

# lscpu --parse=node,core,cpu
# The following is the parsable format, which can be fed to other
# programs. Each different item in every column has an unique ID
# starting from zero.
# Node,Core,CPU
0,0,0
1,1,1
0,2,2
1,3,3
0,4,4
1,5,5
0,6,6
1,7,7
0,0,8
1,1,9
0,2,10
1,3,11
0,4,12
1,5,13
0,6,14
1,7,15

So the odd CPUs are on NUMA node 1 and the even CPUs are on NUMA node 0. The guest is pinned to NUMA node 0 and vhost is pinned to NUMA node 0 too,
which should be good. So why the low throughput?

Let's try assigning the vhost threads to the NUMA node 1 CPUs:

# virsh emulatorpin bigip-virtio 1,3,5,7,9,11,13,15


#  virsh emulatorpin bigip-virtio
emulator: CPU Affinity
----------------------------------
       *: 1,3,5,7,9,11,13,15
Now run the test again:
[SUM]  0.0-10.1 sec  10.1 GBytes  8.58 Gbits/sec <=========8.58G, big difference!!!


  PID USER      PR  NI    VIRT    RES    SHR S %CPU %MEM     TIME+ COMMAND                                                                                                                  P
23344 libvirt+  20   0 35.350g 0.030t  23396 R 99.9 43.4  15:40.95 qemu-system-x86_64 -enable-kvm -name bigip-virtio -S -machine pc-i440fx-trusty,accel=kvm,usb=off -m 31357 -realtime ml+  6
23341 libvirt+  20   0 35.350g 0.030t  23396 R 99.9 43.4  17:39.58 qemu-system-x86_64 -enable-kvm -name bigip-virtio -S -machine pc-i440fx-trusty,accel=kvm,usb=off -m 31357 -realtime ml+  0
23346 libvirt+  20   0 35.350g 0.030t  23396 R 99.9 43.4  15:23.76 qemu-system-x86_64 -enable-kvm -name bigip-virtio -S -machine pc-i440fx-trusty,accel=kvm,usb=off -m 31357 -realtime ml+ 10
23347 libvirt+  20   0 35.350g 0.030t  23396 R 99.9 43.4  15:29.99 qemu-system-x86_64 -enable-kvm -name bigip-virtio -S -machine pc-i440fx-trusty,accel=kvm,usb=off -m 31357 -realtime ml+ 12
23345 libvirt+  20   0 35.350g 0.030t  23396 R 99.7 43.4  15:29.29 qemu-system-x86_64 -enable-kvm -name bigip-virtio -S -machine pc-i440fx-trusty,accel=kvm,usb=off -m 31357 -realtime ml+  8
23348 libvirt+  20   0 35.350g 0.030t  23396 R 99.7 43.4  15:42.95 qemu-system-x86_64 -enable-kvm -name bigip-virtio -S -machine pc-i440fx-trusty,accel=kvm,usb=off -m 31357 -realtime ml+ 14
23342 libvirt+  20   0 35.350g 0.030t  23396 R 98.7 43.4  14:58.66 qemu-system-x86_64 -enable-kvm -name bigip-virtio -S -machine pc-i440fx-trusty,accel=kvm,usb=off -m 31357 -realtime ml+  2
23343 libvirt+  20   0 35.350g 0.030t  23396 R 96.0 43.4  14:58.54 qemu-system-x86_64 -enable-kvm -name bigip-virtio -S -machine pc-i440fx-trusty,accel=kvm,usb=off -m 31357 -realtime ml+  4
23332 root      20   0       0      0      0 R 40.2  0.0   1:12.12 [vhost-23329]                                                                                                           15
23333 root      20   0       0      0      0 R 40.2  0.0   1:05.58 [vhost-23329]                                                                                                           13
23335 root      20   0       0      0      0 R 40.2  0.0   1:04.98 [vhost-23329]                                                                                                            3
23334 root      20   0       0      0      0 R 39.2  0.0   1:04.52 [vhost-23329]                                                                                                            1
23337 root      20   0       0      0      0 R 32.2  0.0   0:47.66 [vhost-23329]                                                                                                           11
23339 root      20   0       0      0      0 R 31.6  0.0   0:50.47 [vhost-23329]                                                                                                           15
23336 root      20   0       0      0      0 S 31.2  0.0   0:56.08 [vhost-23329]                                                                                                            5
23338 root      20   0       0      0      0 R 30.2  0.0   0:49.52 [vhost-23329] 


This tells us that something in the host kernel is using the NUMA node 0 CPUs, so the 8 vhost threads could not get scheduled enough to process the data traffic. My theory is that the physical NIC IRQs are spread across the even cores on NUMA node 0 and softirq runs high on those cores, so the vhost kernel threads did not get enough time on the even cores; assigning the vhost threads to the idle cores on NUMA node 1 gives vhost enough CPU cycles to process the packets.
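One way to sanity-check that theory is to look at where the NIC interrupt vectors and softirq time actually land (a rough sketch; the interface names match the XML dump above, and mpstat comes from the sysstat package):

# which CPUs are servicing the 10G NIC interrupt vectors
grep enp4s0f /proc/interrupts
# per-CPU utilization including the %soft column, sampled while iperf traffic is running
mpstat -P ALL 1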
      
 




Thursday, March 9, 2017

Intrusion Prevention System - Snort

A typical process flow:


        /* Not a completely ideal place for this since any entries added on the
         * PacketCallback -> ProcessPacket -> Preprocess trail will get
         * obliterated - right now there isn't anything adding entries there.
         * Really need it here for stream5 clean exit, since all of the
         * flushed, reassembled packets are going to be injected directly into
         * this function and there may be enough that the obfuscation entry
         * table will overflow if we don't reset it.  Putting it here does
         * have the advantage of fewer entries per logging cycle */




SnortMain -> PacketLoop -> PacketCallback -> ProcessPacket -> Preprocess -> Detect -> fpEvalPacket -> fpEvalHeaderTcp

Wednesday, May 18, 2016

Patch to make mTCP running in VM environment to send packet between VM and physical server across switches

When running mTCP in a VM environment such as VMware ESXi or KVM, the source MAC is zero (see https://github.com/eunyoung14/mtcp/issues/51), which can result in packets being dropped. The following patch fixes this without adding a new DPDK PMD driver for mTCP, to save the porting effort.




 diff --git a/mtcp/src/io_module.c b/mtcp/src/io_module.c
 index ad3e01d..83e0893 100644
 --- a/mtcp/src/io_module.c
 +++ b/mtcp/src/io_module.c
 @@ -63,6 +63,22 @@ GetNumQueues()
     return queue_cnt;
  }
  /*----------------------------------------------------------------------------*/
 +
 +static int GetPortIndex(char *dev_name)
 +{
 +    char *p = dev_name;
 +    long val = -1;
 +    while (*p) { // While there are more characters to process...
 +        if (isdigit(*p)) { // Upon finding a digit, ...
 +            val = strtol(p, &p, 10); // Read a number, ...
 +        } else {
 +            p++;
 +        }
 +    }
 +    return (int)val;
 +}
 +
 +
  int
  SetInterfaceInfo(char* dev_name_list)
  {
 @@ -243,9 +259,10 @@ SetInterfaceInfo(char* dev_name_list)
                     CONFIG.eths[eidx].ip_addr = *(uint32_t *)&sin;
                 }
 -                if (ioctl(sock, SIOCGIFHWADDR, &ifr) == 0 ) {
 +                if(strstr(iter_if->ifa_name, "dpdk") != NULL) {
 +                    ret = GetPortIndex(iter_if->ifa_name);
                     for (j = 0; j < ETH_ALEN; j ++) {
 -                        CONFIG.eths[eidx].haddr[j] = ifr.ifr_addr.sa_data[j];
 +                        CONFIG.eths[eidx].haddr[j] = ports_eth_addr[ret].addr_bytes[j];
                     }
                 }

DPDK pktgen to generate SYN flood

A hack patch to make pktgen generate a SYN flood:



diff --git a/app/cmd-functions.c b/app/cmd-functions.c
 index b2fda7c..c348e73 100644
 --- a/app/cmd-functions.c
 +++ b/app/cmd-functions.c
 @@ -303,6 +303,8 @@ const char *help_info[] = {
     "pkt.size max <portlist> value   - Set pkt size maximum address",
     "pkt.size inc <portlist> value   - Set pkt size increment address",
     "range <portlist> <state>      - Enable or Disable the given portlist for sending a range of packets",
 +    "range.proto <portlist> [tcp|udp|icmp]",
 +    "                  - Set ip proto for sending a range of packets",
     "",
     "<<PageBreak>>",
     "    Flags: P---------------- - Promiscuous mode enabled",
 diff --git a/app/pktgen-tcp.c b/app/pktgen-tcp.c
 index 3c8a853..9d12a88 100644
 --- a/app/pktgen-tcp.c
 +++ b/app/pktgen-tcp.c
 @@ -69,6 +69,26 @@
  #include "pktgen-tcp.h"
 +uint64_t xor_seed[ 2 ];
 +
 +static inline uint64_t
 +xor_next(void) {
 +    uint64_t s1 = xor_seed[ 0 ];
 +    const uint64_t s0 = xor_seed[ 1 ];
 +
 +    xor_seed[ 0 ] = s0;
 +    s1 ^= s1 << 23;                 /* a */
 +    return ( xor_seed[ 1 ] = ( s1 ^ s0 ^ ( s1 >> 17 ) ^ ( s0 >> 26 ) ) ) +
 +        s0;               /* b, c */
 +}
 +
 +static __inline__ uint32_t
 +pktgen_default_rnd_func(void)
 +{
 +    return xor_next();
 +}
 +
 +
  /**************************************************************************//**
  *
  * pktgen_tcp_hdr_ctor - TCP header constructor routine.
 @@ -100,10 +120,10 @@ pktgen_tcp_hdr_ctor(pkt_seq_t *pkt, tcpip_t *tip, int type __rte_unused)
     tip->tcp.sport   = htons(pkt->sport);
     tip->tcp.dport   = htons(pkt->dport);
 -    tip->tcp.seq    = htonl(DEFAULT_PKT_NUMBER);
 -    tip->tcp.ack    = htonl(DEFAULT_ACK_NUMBER);
 +    tip->tcp.seq    = htonl(pktgen_default_rnd_func());
 +    tip->tcp.ack    = 0;
     tip->tcp.offset   = ((sizeof(tcpHdr_t) / sizeof(uint32_t)) << 4);   /* Offset in words */
 -    tip->tcp.flags   = ACK_FLAG;                     /* ACK */
 +    tip->tcp.flags   = SYN_FLAG;                     /* SYN */
     tip->tcp.window   = htons(DEFAULT_WND_SIZE);
     tip->tcp.urgent   = 0;


root@pktgen-template:/home/admin/pktgen-dpdk/dpdk/examples/pktgen-dpdk# ./app/app/x86_64-native-linuxapp-gcc/pktgen -c ff   -- -P -m "[0:0-7].0 "
 Copyright (c) <2010-2016>, Intel Corporation. All rights reserved.
   Pktgen created by: Keith Wiles -- >>> Powered by Intel® DPDK <<<

Lua 5.3.2  Copyright (C) 1994-2015 Lua.org, PUC-Rio
>>> Packet Burst 32, RX Desc 512, TX Desc 512, mbufs/port 4096, mbuf cache 512

=== port to lcore mapping table (# lcores 8) ===
   lcore:     0     1     2     3     4     5     6     7
port   0:  D: T  0: 1  0: 1  0: 1  0: 1  0: 1  0: 1  0: 1 =  1: 8
Total   :  1: 1  0: 1  0: 1  0: 1  0: 1  0: 1  0: 1  0: 1
    Display and Timer on lcore 0, rx:tx counts per port/lcore

Configuring 1 ports, MBUF Size 1920, MBUF Cache Size 512
Lcore:
    0, RX-TX
                RX( 1): ( 0: 0)
                TX( 1): ( 0: 0)
    1, TX-Only
                TX( 1): ( 0: 1)
    2, TX-Only
                TX( 1): ( 0: 2)
    3, TX-Only
                TX( 1): ( 0: 3)
    4, TX-Only
                TX( 1): ( 0: 4)
    5, TX-Only
                TX( 1): ( 0: 5)
    6, TX-Only
                TX( 1): ( 0: 6)
    7, TX-Only
                TX( 1): ( 0: 7)

Port :
    0, nb_lcores  8, private 0x8f0690, lcores:  0  1  2  3  4  5  6  7



** Dev Info (rte_vmxnet3_pmd:0) **
   max_vfs        :   0 min_rx_bufsize    :1646 max_rx_pktlen : 16384 max_rx_queues         :  16 max_tx_queues:   8
   max_mac_addrs  :   1 max_hash_mac_addrs:   0 max_vmdq_pools:     0
   rx_offload_capa:  13 tx_offload_capa   :  45 reta_size     :     0 flow_type_rss_offloads:0000000000000514
   vmdq_queue_base:   0 vmdq_queue_num    :   0 vmdq_pool_base:     0
** RX Conf **
   pthreash       :   0 hthresh          :   0 wthresh        :     0
   Free Thresh    :   0 Drop Enable      :   0 Deferred Start :     0
** TX Conf **
   pthreash       :   0 hthresh          :   0 wthresh        :     0
   Free Thresh    :   0 RS Thresh        :   0 Deferred Start :     0 TXQ Flags:00000200

Initialize Port 0 -- TxQ 8, RxQ 1,  Src MAC 00:50:56:86:10:76
Pktgen > load tcp.txt
Pktgen> start 0
Pktgen> stop 0
root@pktgen-template:/home/admin/pktgen-dpdk/dpdk/examples/pktgen-dpdk# cat tcp.txt
#
# Pktgen - Ver: 2.9.17 (DPDK 16.04.0-rc2)
# Copyright (c) <2010-2016>, Intel Corporation. All rights reserved., Powered by Intel® DPDK

# Command line arguments: (DPDK args are defaults)
# ./app/app/x86_64-native-linuxapp-gcc/pktgen -c ff -n 3 -m 512 --proc-type primary -- -P -m [0:1-7].0

#######################################################################
# Pktgen Configuration script information:
#   GUI socket is Not Enabled
#   Flags 00040004
#   Number of ports: 1
#   Number ports per page: 4
#   Number descriptors: RX 512 TX: 512
#   Promiscuous mode is Enabled


#######################################################################
# Global configuration:
geometry 132x44
mac_from_arp disable

######################### Port  0 ##################################
#
# Port:  0, Burst: 32, Rate:100%, Flags:c0000010, TX Count:Forever
#           SeqCnt:0, Prime:1 VLAN ID:0001, Link:
#
# Set up the primary port information:
set 0 count 0
set 0 size 64
set 0 rate 100
set 0 burst 32
set 0 sport 1234
set 0 dport 5678
set 0 prime 1
type ipv4 0
proto tcp 0
set ip dst 0 10.1.72.17
#set ip dst 0 10.1.72.8
set ip src 0 10.1.72.154/24
set mac 0 00:23:E9:63:5B:83
#set mac 0 00:50:56:86:84:90
vlanid 0 1

pattern 0 zero
user.pattern 0 0123456789abcdef

latency 0 disable
mpls 0 disable
mpls_entry 0 0
qinq 0 disable
qinqids 0 0 0
gre 0 disable
gre_eth 0 disable
gre_key 0 0
#
# Port flag values:
icmp.echo 0 disable
pcap 0 disable
range 0 enable
process 0 disable
capture 0 disable
rxtap 0 disable
txtap 0 disable
vlan 0 disable

#
# Range packet information:
src.mac start 0 00:50:56:86:10:76
src.mac min 0 00:00:00:00:00:00
src.mac max 0 00:00:00:00:00:00
src.mac inc 0 00:00:00:00:00:00
dst.mac start 0 00:23:E9:63:5B:83
#dst.mac start 0 00:50:56:86:84:90
dst.mac min 0 00:00:00:00:00:00
dst.mac max 0 00:00:00:00:00:00
dst.mac inc 0 00:00:00:00:00:00

src.ip start 0 10.1.72.154
src.ip min 0 10.1.72.154
src.ip max 0 10.1.72.254
src.ip inc 0 0.0.0.1

dst.ip start 0 10.1.72.17
dst.ip min 0 10.1.72.17
dst.ip max 0 10.1.72.17
dst.ip inc 0 0.0.0.1

#dst.ip start 0 10.1.72.8
#dst.ip min 0 10.1.72.8
#dst.ip max 0 10.1.72.8
#dst.ip inc 0 0.0.0.1

src.port start 0 1025
src.port min 0 1025
src.port max 0 65512
src.port inc 0 1

dst.port start 0 80
dst.port min 0 0
dst.port max 0 0
dst.port inc 0 0

vlan.id start 0 1
vlan.id min 0 1
vlan.id max 0 4095
vlan.id inc 0 0

pkt.size start 0 64
pkt.size min 0 64
pkt.size max 0 1518
pkt.size inc 0 0

#
# Set up the sequence data for the port.
set 0 seqCnt 0

################################ Done #################################
