Monday, April 15, 2024

Streaming Messages to a Go gRPC Server


 

In this post we will demonstrate streaming gRPC messages to a server using Go.


Create proto file

The proto file describes the gRPC service APIs. In this example we create a single API to send a stream of persons from the client to the server.


persons.proto

syntax = "proto3";
option go_package = "my.example.com/com/grpctemplates";

service Persons {
rpc StreamPersons(stream Person) returns(ProcessedIndication){

}
}

message Person{
string name = 1;
int32 age = 2;
}

message ProcessedIndication{

}


Generate Go templates

To generate Go sources from the proto file, we first install the required tools:

sudo apt install -y protobuf-compiler
protoc --version
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.28
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.2

Next we run the tools to generate the Go templates:

export PATH="$PATH:$(go env GOPATH)/bin"
protoc --go_out=. --go_opt=paths=source_relative \
--go-grpc_out=. --go-grpc_opt=paths=source_relative \
persons.proto

rm -rf grpctemplates
mkdir grpctemplates
mv persons.pb.go grpctemplates/
mv persons_grpc.pb.go grpctemplates/
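
The Go code below imports the generated package as grpcexample/grpctemplates, which assumes the project is a Go module named grpcexample. If you are starting from scratch, something like the following sets up the module and fetches the gRPC dependencies (the module name is an assumption based on the imports used in this post):

go mod init grpcexample
go get google.golang.org/grpc google.golang.org/protobuf
go mod tidy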

The Main

In our example we will run both the client and the server in the same process.

package main

import (
"grpcexample/personsclient"
"grpcexample/personsserver"
"time"
)

func main() {
go func() {
time.Sleep(time.Second)
personsclient.RunClient()
}()

personsserver.RunServer()
}


The Server

The server implements an API that receives the stream of persons, prints them, and returns a completion indication.

package personsserver

import (
"fmt"
"google.golang.org/grpc"
"grpcexample/grpctemplates"
"io"
"log"
"net"
)

type personsServer struct {
grpctemplates.UnimplementedPersonsServer
}

func (s *personsServer) StreamPersons(stream grpctemplates.Persons_StreamPersonsServer) error {
for {
person, err := stream.Recv()
if err == io.EOF {
return stream.SendAndClose(&grpctemplates.ProcessedIndication{})
}
if err != nil {
panic(err)
}

log.Printf("got person %v\n", person.Name)
}
}

func RunServer() {
log.Print("starting gRPC server")
port := 8080
listener, err := net.Listen("tcp", fmt.Sprintf("localhost:%d", port))
if err != nil {
panic(err)
}

persons := personsServer{}
grpcServer := grpc.NewServer()
grpctemplates.RegisterPersonsServer(grpcServer, &persons)

err = grpcServer.Serve(listener)
if err != nil {
panic(err)
}
}

The Client

The client sends a stream of persons to the server, and waits for completion before closing the connection.

package personsclient

import (
"context"
"fmt"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"grpcexample/grpctemplates"
"log"
)

func RunClient() {
log.Printf("client sending data starting\n")

connection, err := grpc.Dial(
"localhost:8080",
grpc.WithTransportCredentials(insecure.NewCredentials()),
)
if err != nil {
panic(err)
}

defer func() {
err = connection.Close()
if err != nil {
panic(err)
}
}()

client := grpctemplates.NewPersonsClient(connection)
stream, err := client.StreamPersons(context.Background())
if err != nil {
panic(err)
}

for i := range 10 {
person := grpctemplates.Person{
Name: fmt.Sprintf("person %v", i),
Age: int32(i),
}
err = stream.Send(&person)
if err != nil {
panic(err)
}
}
_, err = stream.CloseAndRecv()
if err != nil {
panic(err)
}
log.Printf("client sending data done\n")
}



Monday, April 8, 2024

GraphQL Usage in Go


 


This post includes an example of GraphQL usage in Go. The actual GraphQL implementation uses the graphql-go library.


Some insights I got from this:

  • Implementing code with GraphQL is much more complicated than using REST. We need to re-write the specifications for each usage and repeat the field names. It is not built into the language like Go's simple JSON marshaling.
  • To use a mutation, drop the anonymous root braces in the query and start the request with the mutation keyword instead (as in the third query below). OMG, I've spent an hour trying to understand why it was not working.
  • I believe the advantage of GraphQL over REST is dynamic field selection. I don't know many applications where this matters.
  • Authorization is not fully built into GraphQL. I guess this is why we have so many security issues in applications using GraphQL.



package main

import (
"encoding/json"
"fmt"
"github.com/graphql-go/graphql"
)

type dbActor struct {
Name string
BirthYear int
}

var dbActors = []*dbActor{
{
Name: "John Travolta",
BirthYear: 1954,
},
{
Name: "Robert Redford",
BirthYear: 1936,
},
}

func main() {
schema := createSchema()
var query string

query = `
{
list{
name
birthYear
}
}
`
runQuery(schema, query)

query = `
{
actor(name: "Robert Redford"){
birthYear
}
}
`
runQuery(schema, query)

query = `
mutation {
create(name:"Meryl Streep",birthYear:1949){
name
}
}
`
runQuery(schema, query)

query = `
{
list{
name
birthYear
}
}
`
runQuery(schema, query)

}

func runQuery(schema graphql.Schema, query string) {
params := graphql.Params{
Schema: schema,
RequestString: query,
}

result := graphql.Do(params)
if len(result.Errors) > 0 {
panic(fmt.Errorf("failed to execute graphql operation, errors: %+v", result.Errors))
}

jsonData, err := json.MarshalIndent(result, "", " ")
if err != nil {
panic(err)
}

fmt.Printf("\n%s\n", jsonData)
}

func createSchema() graphql.Schema {
schemaActor := graphql.NewObject(graphql.ObjectConfig{
Name: "actor",
Fields: graphql.Fields{
"name": &graphql.Field{
Type: graphql.String,
},
"birthYear": &graphql.Field{
Type: graphql.Int,
},
},
})

schemaQuery := graphql.NewObject(graphql.ObjectConfig{
Name: "QueryRoot",
Fields: graphql.Fields{
"list": &graphql.Field{
Type: graphql.NewList(schemaActor),
Resolve: func(params graphql.ResolveParams) (interface{}, error) {
return dbActors, nil
},
},
"actor": &graphql.Field{
Type: schemaActor,
Args: graphql.FieldConfigArgument{
"name": &graphql.ArgumentConfig{
Type: graphql.String,
},
},
Resolve: func(params graphql.ResolveParams) (interface{}, error) {
name, ok := params.Args["name"].(string)
if !ok {
panic("name argument invalid")
}
for _, actor := range dbActors {
if actor.Name == name {
return actor, nil
}
}
return nil, nil
},
},
},
})

schemaMutation := graphql.NewObject(graphql.ObjectConfig{
Name: "Mutation",
Fields: graphql.Fields{
"create": &graphql.Field{
Type: schemaActor,
Args: graphql.FieldConfigArgument{
"name": &graphql.ArgumentConfig{
Type: graphql.NewNonNull(graphql.String),
},
"birthYear": &graphql.ArgumentConfig{
Type: graphql.NewNonNull(graphql.Int),
},
},
Resolve: func(params graphql.ResolveParams) (interface{}, error) {
name, ok := params.Args["name"].(string)
if !ok {
panic("name argument invalid")
}
birthYear, ok := params.Args["birthYear"].(int)
if !ok {
panic("birthYear argument invalid")
}
actor := dbActor{
Name: name,
BirthYear: birthYear,
}
dbActors = append(dbActors, &actor)
return actor, nil
},
},
},
})

schemaConfig := graphql.SchemaConfig{
Query: schemaQuery,
Mutation: schemaMutation,
}
schema, err := graphql.NewSchema(schemaConfig)
if err != nil {
panic(err)
}
return schema
}





Monday, April 1, 2024

AWS Application Load Balancer Vs. AWS Network Load Balancer



In this post we will review the AWS load balancer types and investigate the tasks each load balancer is suitable for. We will review this in the context of a load balancer serving clients that access services in a kubernetes cluster.


ALB: AWS Application Load Balancer

An ALB operates at communication layer 7 (the application layer).

An ALB distributes traffic among the pods of multiple kubernetes cluster services. A single ALB can serve multiple services, and based on traffic metadata such as host name and path, the ALB selects the target service. Once a service is located, the ALB routes the HTTP request directly to the related pod, while distributing the requests among all the pods that are READY for service. Notice that the ALB's health check is separate from the liveness and readiness probes configured on the pod, and if the health check path is not the default root path ("/"), a dedicated ALB configuration should be made.

An ALB handles both HTTP 1.x and HTTP 2.x requests, which means it can also handle the gRPC protocol.

An ALB can also supply additional layer 7 functions such as authentication and WAF.

NLB: AWS Network Load Balancer

An NLB operates at communication layer 4 (the transport layer).

An NLB distributes incoming TCP/UDP connections among the pods of a single kubernetes service; a single NLB can serve only one service.
Each incoming connection is proxied to one of the READY service pods. Once the NLB has established the TCP connection to the target pod, it stays connected as long as the client and the pod keep the connection active. The NLB does not inspect the data transferred over the TCP connection, and hence cannot distribute individual requests to other pods.

Which Should I Use?

Let's start with the price: an ALB costs $0.008 per LCU, while an NLB costs $0.006 per LCU.

So an ALB costs more, but it has a major advantage: it distributes traffic per request, and not per TCP connection. This is critical in case we have multiple pods and long-lasting client connections.

An ALB's IP address must be resolved through DNS, while an NLB can use a static IP address; hence, if we cannot use DNS, we must use an NLB.


Hence to choose the appropriate load balancer type:

  • If we must use a static IP address - use an NLB
  • If we use a non-HTTP protocol - use an NLB
  • If we have a single service with a single pod - use an NLB
  • If we have a single service with short-lived client connections - use an NLB
  • For all other cases - use an ALB






Monday, March 25, 2024

Using KEDA with a Prometheus Scaler


 

In this post we will show a simple usage of KEDA with a Prometheus scaler to set the replica count of a deployment.


KEDA is an event-driven autoscaler; it wraps the kubernetes Horizontal Pod Autoscaler and simplifies autoscaling while enabling scaling based on a huge collection of metric sources.


Deploy KEDA

Deploying KEDA is done using its helm chart:

helm repo add kedacore https://kedacore.github.io/charts
helm repo update
helm install keda kedacore/keda --namespace keda --create-namespace

Deploy a ScaledObject

A ScaledObject is a configuration of the required autoscaling. We deploy the following ScaledObject:


apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: my-scaledobject
  namespace: default
spec:
  minReplicaCount: 1
  maxReplicaCount: 5
  pollingInterval: 5
  cooldownPeriod: 30
  scaleTargetRef:
    name: my-deployment
    kind: Deployment
    apiVersion: apps/v1
  triggers:
    - type: prometheus
      metadata:
        serverAddress: http://prometheus-service.default:80
        threshold: '100'
        query: sum (rate(handled_items[1m]))


We specify the min and max replicas, as well as the polling interval and the cooldown period.

The scaled object target is a deployment.

The metric source is Prometheus, whose service address must be supplied. Scaling is driven by a Prometheus query value. In this example we set the threshold to 100, which means that above an average of 100 per pod, KEDA will scale up the number of replicas.
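
As a rough worked example (this follows the standard average-value calculation that KEDA delegates to the Horizontal Pod Autoscaler): if the query returns 320, the desired replica count is approximately ceil(320 / 100) = 4, clamped between minReplicaCount and maxReplicaCount.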





Monday, March 18, 2024

Creating graphs using Graphviz DOT files

 

Graphviz is open source graph visualization software. One of its covered aspects is the DOT file standard, which describes a graph. These graphs can later be visualized using related software, and also in online visualization sites such as Graphviz Online.

In this post we will explore the various capabilities of the DOT file format.

For more amazing graphs, see the gallery.
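
The examples below can also be rendered locally with the Graphviz command line tool, assuming Graphviz is installed and the graph is saved as graph.dot:

dot -Tsvg graph.dot -o graph.svg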

A simple undirected graph


graph test {
a -- b -- c;
}




A simple directed graph

digraph test {
a -> b -> c;
}


Multiple graphs with styles


  • To have a sub-graph in its own box: use the prefix "cluster" in the graph name.
  • Edge connections:
    • node to node edges
    • cluster to cluster edges
  • Use "node" to apply attributes to all nodes in the scope

digraph {

compound=true;

subgraph cluster_a{
label="A";
node [style=filled fillcolor="#ff00ff" fontcolor="white" shape=box];
a1[shape=star];
a1 -> {a2 a3};
}

subgraph cluster_b{
label="B";
b1 -> b2;
}

a1 -> b1[label="nodes edge"];
a2 -> b2[label="clusters edge" ltail="cluster_a" lhead="cluster_b"];
}




Record node

  • Enables multiple segments in a node.
  • We can tag a segment, and use it later in an edge.
digraph {

subgraph x{
node[shape="record"];
a[label="{line 1|<x>line2}"];
b[label="{line1|{lin2_col1|<y>line2_col2|line2_col3}}"];
a:x->b:y;
}

}





Use HTML in a label

We can use HTML for a label, but only in a table format.

digraph {

staging [
label=<<table border="0" cellborder="1" cellspacing="0" cellpadding="4">
<tr> <td> <b>important</b></td> </tr>
<tr> <td> to have fun</td> </tr>
</table>>
shape=plain
]

}








Monday, March 11, 2024

Dynamically Allocate and Delete PersistentVolumeClaim for a CronJob


 


In this post we will review the steps to dynamically create and delete a PersistentVolumeClaim for a CronJob.

A CronJob might require a large amount of temporary storage, and we don't want to keep the PersistentVolumeClaim active while the job is not running, since the cost might be high. For example, assume we have a CronJob running once a week for 3 hours, and requiring 1TB of disk space for calculations. The cost of leaving such a disk active for an entire week is very high, hence we should dynamically allocate and remove the disk.

Kubernetes does not supply an out-of-the-box mechanism to handle this, hence we do it ourselves. We handle this with 3 CronJobs:

1. The allocate CronJob, which creates the PVC

2. The actual computation CronJob

3. The cleanup CronJob


The Allocate CronJob

This CronJob creates the PVC before the computation job starts. It should use a schedule that runs it just a few minutes before the computation job. The following are the kubernetes entities required to run the allocate CronJob. Notice that we must also provide permissions for handling the PVC.

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: allocate-role
rules:
  - apiGroups: [ "" ]
    resources: [ "persistentvolumes" ]
    verbs: [ "create", "list", "delete", "get", "patch", "watch" ]
  - apiGroups: [ "" ]
    resources: [ "persistentvolumeclaims" ]
    verbs: [ "create", "list", "delete", "get", "patch", "watch" ]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: allocate-role-binding
subjects:
  - kind: ServiceAccount
    name: allocate-service-account
    namespace: default
roleRef:
  kind: ClusterRole
  name: allocate-role
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: allocate-service-account
  namespace: default
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: allocate-config
data:
  pvc.yaml: |-
    apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: my-pvc
      labels:
        type: local
    spec:
      storageClassName: "gp2"
      accessModes:
        - ReadWriteOnce
      resources:
        requests:
          storage: 1000Gi
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: allocate-cronjob
spec:
  schedule: "0 0 * * *"
  startingDeadlineSeconds: 36000
  concurrencyPolicy: Replace
  timeZone: "Etc/UTC"
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          serviceAccountName: allocate-service-account
          restartPolicy: Never
          containers:
            - name: allocate
              image: repo/allocate/dev:latest
              imagePullPolicy: IfNotPresent
              env:
                - name: PVC_NAME
                  value: my-pvc
                - name: NAMESPACE
                  value: default
              volumeMounts:
                - name: config
                  mountPath: /config
          volumes:
            - name: config
              configMap:
                name: allocate-config

The allocate image is a simple script that runs kubectl to create the PVC:

#!/usr/bin/env bash
set -e
set -x

echo "prepare starting"

kubectl delete pvc ${PVC_NAME} --namespace ${NAMESPACE} --ignore-not-found=true
kubectl apply -f /config/pvc.yaml

echo "prepare done"




The Cleanup CronJob


The Cleanup CronJob runs after the computation job completes and deletes the PVC. This includes the following kubernetes entities:


---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: cleanup-role
rules:
  - apiGroups: [ "" ]
    resources: [ "persistentvolumeclaims" ]
    verbs: [ "create", "list", "delete", "get", "patch", "watch" ]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: cleanup-role-binding
subjects:
  - kind: ServiceAccount
    name: cleanup-service-account
    namespace: default
roleRef:
  kind: ClusterRole
  name: cleanup-role
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: cleanup-service-account
  namespace: default
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: cleanup-cronjob
spec:
  schedule: "0 4 * * *"
  startingDeadlineSeconds: 36000
  concurrencyPolicy: Replace
  timeZone: "Etc/UTC"
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          serviceAccountName: cleanup-service-account
          restartPolicy: Never
          containers:
            - name: cleanup
              image: repo/cleanup/dev:latest
              imagePullPolicy: IfNotPresent
              env:
                - name: PVC_NAME
                  value: my-pvc
                - name: NAMESPACE
                  value: default


The cleaner image runs the following script:


#!/usr/bin/env bash
set -e
set -x

echo "cleanup starting"

kubectl delete pvc ${PVC_NAME} --namespace ${NAMESPACE} --ignore-not-found=true


echo "cleanup done"





Monday, February 26, 2024

Dall-E3 Advanced Prompts Guidelines

 



OpenAI has recently released Dall-E3, a great improvement over the Dall-E2 image generator. You may have used it and generated some images using simple text prompts. The real "art" in this game is understanding prompts and how to design them to get accurate results.


The image at the top of this blog was generated using the prompt:


a photo of Salvador Dali drawing on a screen of a laptop


While it is a nice image, I had a different idea in mind when supplying the prompt. How can we create a better prompt that will generate the image we had in mind?

The general guideline is to split the prompt into multiple statements, each statement adding another requirement for the image generator, for example:


Main object: The main object is the artist Salvador Dali. Behavior: Salvador Dali is painting a cute puppy on a laptop. Environment: The artist is located in an artist studio room. Items: There will be various painting related items spread in a mess around in the room. These items include: colors palette, colors buckets, various brushes in different size. Add additional painting related tools. Colors: Use brownish and yellowish background colors


And the result is:




We can see a great improvement; though still not perfect, the image generator is starting to get the idea we have in mind. Now that we have the right concept, we can add more statements, or update the existing statements, as if we're programming a multi-layered application.

Let's give it another try:


Main object: The main object is the artist Salvador Dali. Behavior: Salvador Dali stands and draws a cute puppy on a laptop. Environment: The artist is located in an artist studio room. Items: There will be various painting related items spread in a mess around in the room. These items include: colors palette, colors buckets, various brushes in different size. Add additional painting related tools. Colors: Use brownish and yellowish background colors Point of View: The point of view is diagonal for the top right side toward the bottom left direction.


And this time, we're almost there:




Now a final touch:


Main object: The main object is the artist Salvador Dali. Behavior: Salvador Dali stands and draws a drawing of cute puppy on a big laptop. Body language: Salvador Dali's face expression is very busy. Environment: The artist is located in an artist studio room. Items: There will be various painting related items spread in a mess around in the room. These items include: colors palette, colors buckets, various brushes in different size. Add additional painting related tools. Colors: Use brownish and yellowish background colors Point of View: The point of view is diagonal for the top right side toward the bottom left direction. We can see both Salvador Dali's face and the laptop




Monday, February 19, 2024

New Software Project: Steps and Architecture



In this post, we'll outline the steps needed to initiate a new software project. While these steps may not be applicable to all project types, they encompass details relevant to most software projects. Having a clear understanding of these steps from the outset can enhance our progress and help prevent costly mistakes that may arise if discovered later in the implementation stage.


Use a kubernetes architecture

Every contemporary software project should be built upon a Kubernetes architecture, offering numerous significant benefits. Firstly, Kubernetes provides a comprehensive framework for addressing various software challenges including scaling, upgrades, storage management, log handling, and load balancing. Moreover, once a project is configured for Kubernetes, it becomes compatible with both bare metal Kubernetes setups and any major cloud provider platforms.

Use an easy scaled DBMS

Traditionally, software relied on relational database management systems (DBMS) for their support of complex data processing via SQL. However, these systems have significant drawbacks, particularly in terms of cluster support and scalability. While major cloud providers offer built-in solutions for such DBMS, like AWS RDS, it's advisable to consider alternative, simpler DBMS for ongoing project activities, such as Redis or MongoDB. These lightweight DBMS provide easier cluster support and scalability.

Implement CI/CD from day 1

Integration of CI/CD should be prioritized right from the inception of your project rather than waiting until the deployment phase in production. By integrating Continuous Integration and Continuous Deployment (CI/CD) practices from day one, you establish a foundation for efficient project management and development. This comprehensive approach encompasses not only the seamless build, test, and deployment processes but also facilitates early detection of potential issues and ensures consistent quality throughout the project lifecycle. Embracing CI/CD early on sets the stage for streamlined development workflows and enables rapid iteration and delivery of software updates, ultimately enhancing project agility and resilience.

Enable desktop based system

While CI/CD remains a cornerstone of modern software development, it's equally imperative to conduct thorough local testing on your machine for expedited bug detection and feature iteration. The ability to build and run the project locally on your desktop is invaluable for accelerating development cycles and ensuring rapid feedback. To achieve this seamless local development experience, it's essential to develop scripts that automate the local build and deployment processes, streamlining testing and validation efforts. Kubernetes emerges as a highly recommended solution, providing a robust framework for orchestrating containerized applications and enabling efficient local deployment. Moreover, adopting Helm-based configuration not only simplifies management but also optimizes resource utilization, ensuring that the system operates efficiently even in local development environments. By prioritizing local testing and leveraging Kubernetes alongside Helm, developers can achieve faster development iterations, smoother testing workflows, and ultimately deliver higher-quality software solutions.

Identify performance bottlenecks

In the realm of software development, striking a delicate balance between crafting clear and simple code while ensuring optimal speed and performance is paramount. Generally, it's advisable to prioritize the development of code that is transparent and straightforward, deferring performance optimizations until later stages, typically after rigorous stress testing has been conducted. However, the role of the system architect is pivotal in this regard. It's incumbent upon them to proactively identify potential performance bottlenecks early in the project's lifecycle.

Anticipating these bottlenecks allows architects to integrate specialized treatments and optimizations into the system's design from the outset. By doing so, they lay a robust foundation that mitigates the risk of performance issues arising later in the development process. This proactive approach is critical because rectifying performance deficiencies at a later stage may necessitate extensive architectural revisions, potentially leading to project delays or, in the worst-case scenario, requiring a complete overhaul.

Therefore, while it's prudent to prioritize simplicity and clarity in code implementation, it's equally crucial for architects to engage in forward-thinking and strategic planning to address potential performance challenges preemptively. By doing this, development teams can minimize the risk of encountering significant hurdles during the stress testing phase, ensuring a smoother and more efficient project lifecycle overall.

Use project tests

See this post.

Use internal management GUI

It's important to differentiate between the end user graphical user interface (GUI) and the internal management GUI within a project. While the end user GUI focuses on providing a user-friendly interface for external users, the internal management GUI serves developers and system administrators, offering a robust set of functionalities without necessarily prioritizing visual aesthetics. This internal interface enables developers to efficiently manage and monitor the system, incorporating various tools and features tailored to their needs.

Moreover, the internal management GUI can include data visualization views, providing a clear depiction of the system's performance and success in achieving its objectives. These visualizations offer valuable insights into system metrics and status, facilitating informed decision-making and troubleshooting.

By distinguishing between these two interfaces and prioritizing functionality over visibility in the internal management GUI, development teams can streamline their workflow and enhance the efficiency of system management and monitoring processes. See this post for more.

Run Stress Tests

Regular stress testing, either periodically or as part of the CI/CD pipeline, is essential to gauge how well a system performs under anticipated loads. These stress tests should quantify the expected operational costs of running the system under predefined levels of load. To effectively analyze stress test results and gain actionable insights, it's crucial to utilize monitoring tools such as Prometheus and Grafana.

By integrating Prometheus and Grafana into the testing framework, teams can visualize key performance metrics and identify potential bottlenecks or areas for optimization. It's worth noting that relying solely on CPU and memory metrics may not provide a comprehensive understanding of system performance. Therefore, it's advisable to augment these metrics with custom Prometheus counters embedded within the microservices of the project. These custom counters can capture additional application-specific information and statuses, offering a more nuanced view of system behavior during stress testing.

By leveraging Prometheus and Grafana in conjunction with custom counters, teams can gain a holistic understanding of system performance under varying loads, enabling informed decision-making and proactive optimization efforts. See also this post for more details.

Documentation Practices

Effective documentation practices are essential for ensuring clarity, maintainability, and scalability throughout the software project lifecycle. Comprehensive documentation serves as a valuable resource for developers, stakeholders, and end-users alike, facilitating understanding and promoting successful project outcomes. Here are some key aspects to consider when establishing documentation practices:

Code Comments: Do not use code comments, unless you are doing something that is not obvious and has a hidden meaning.

API Documentation: Documenting APIs (Application Programming Interfaces) is crucial for enabling seamless integration and interoperability between different components of the system. API documentation should include endpoint descriptions, request/response formats, authentication methods, and usage examples to guide developers in utilizing the API effectively. A good way to handle this is using Swagger.


Monday, February 12, 2024

NATS Monitoring Using Prometheus/Grafana


In a recent post we've set up a NATS cluster in kubernetes. In this post, we will review the steps to monitor NATS using Prometheus and Grafana.


The Prometheus counters for NATS monitoring are provided using an exporter. This is implemented as a sidecar - an additional container in each pod which pulls the monitoring data from the local pod's NATS container, and exposes an HTTP endpoint for Prometheus.


The Exporter

To implement the exporter we create a minimal Go program that runs the NATS exporter. The exporter uses the localhost address, as it runs in the same pod as the NATS container.


package main

import (
"github.com/nats-io/prometheus-nats-exporter/exporter"
)

func main() {
opts := exporter.GetDefaultExporterOptions()
opts.ListenAddress = "0.0.0.0"
opts.ListenPort = 8080
opts.GetVarz = true
opts.NATSServerURL = "http://localhost:8222"

natsExporter := exporter.NewExporter(opts)
err := natsExporter.Start()
if err != nil {
panic(err)
}
natsExporter.WaitUntilDone()
}


The Sidecar

To use a sidecar, we add an additional container after the two existing ones.
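
The following is a minimal sketch of the sidecar entry added to the containers list of the NATS statefulset; the container and image names are assumptions (the image is the one built from the exporter program above):

- name: metrics-exporter
  # assumed: image built from the Go exporter program above
  image: repo/nats-exporter:latest
  ports:
    - containerPort: 8080
      name: metrics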



The Scraping

To enable scraping of the pod by prometheus, we add the related annotations to the template section of the NATS statefulset.


template:
  metadata:
    annotations:
      prometheus.io/scrape: "true"
      prometheus.io/path: "/metrics"
      prometheus.io/port: "8080"


The Grafana Dashboard

Last, we import a pre-made example dashboard from here.

Notice that the dashboard displays absolute values, while in real life you will probably want to change them to use rates. For example, change this:

gnatsd_varz_in_bytes[1m]

to:

rate(gnatsd_varz_in_bytes[1m])



Monday, January 29, 2024

Visualized Data Driven Development




In this post we review a combination of two development methods: Data Driven Development and Visualized Data. These methods are key to converting an idea into a real product.


A. The Fairy Tale

Once upon a time, a great developer had an idea: "I'll add my software component in the middle of the network traffic and make something really good with it!". And so, the great developer implemented his idea, placed his software component in just the right spot somewhere along the network traffic, and everything worked!


Ahh.. No...


These kinds of stories are fairy tales and do not exist in real life. When we have an idea, we are not aware of the full implications of the implementation, and our plan to cope with theoretical data meets unexpected behavior in the real data. Trying to implement and deploy such a software component tends to fail quickly and disgracefully.


B. Data Driven Development


B.1. Get The Data

To prepare for real data, we need to develop our software side by side with real data, starting from day one. This means that we need to get hold of real data. This is possible if we're part of a software organization which already has several products running out there in the cloud. We need to get a tap/mirror of the data and save it for our application. For example, we can enable saving of the real data in AWS S3 for a small subset of the organization's customers.

B.2. Secure The Data

Access to the real data has huge benefits, but also huge risks. Think about real network traffic data which contains credit card details, as well as medical information. We must use ALL of our means to secure access to the data.

B.3. Anonymization of the Data

Notice that this requires us to handle PII, and comply with the relevant country laws, such as GDPR.

One way to handle this is to anonymize the data before saving it. We can also save the data for a short period of time, and then delete it. This should be carefully handled, as a leak of customers' real data has devastating implications for the software organization.

B.4. Simulation of Data Flow

Now that we have the data, and before starting to implement our software component, we should create a simulation wrapper. The simulation component reads the data from the saved location, and simulates running our software component as if it were actually running in production in the cloud. This means that the simulation should stream the data into our software component.

B.5 Use the Same Source

An important thing to notice is that our simulation is a wrapper around the actual component source code, the same code that runs in production. Do not make the mistake of maintaining two sets of code, one for simulation and one for production.


C. Visualized Data

Our software component does something (otherwise why does it exist?). It can, for example, periodically report an analysis of the data, or it can alter something in the data. Whatever it does, we need to be aware of this both as part of our simulation and as part of the production run. How should we check that it is doing its job?


C.1. Logs - The Wrong Method

While logs might be fine for deep inspection of a problem, they are not suitable for checking whether the software component fulfills its purpose. There are many problems with logs:

  • Do we need to scan through thousands of log lines to find the related lines that represent the status?
  • Do we plan to keep the verbose logs in production, and pay the price of storing and searching them?
  • Can we show the logs to a non-software-engineer and explain the result?
These are rhetorical questions. The answer is hell no! We can use logs to report errors and for periodic, infrequent prints, but using logs to check our solution is a bad practice.

C.2. GUI - The Right Method

We should include a GUI to present the status of our solution - not just the end result, but the entire processing. While small and cheap software components might be fine with a ready-made GUI such as Elasticsearch and Kibana or Prometheus and Grafana, in most cases it would be wiser to create our own GUI to present the software component status, since these pre-made tools are great for the first days, but their flexibility is limited for a long-term solution. This interactive GUI should include graphs, histograms, text, and whatever else is required to display our status clearly.

C.3. Save the Status

How do we provide the software component status? We dump it periodically: once a second or once a minute, depending on your needs. The dump is a simple JSON file, saved under a name derived from the data timestamp. This status is always saved by the software component itself!
It means that no matter whether we're running under the simulation wrapper or in the production cloud, we get the same status JSON files that represent the software component status at a specific time.
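
As a minimal sketch of this idea (the status fields, the output directory, and the dump interval are assumptions), the software component can periodically marshal a status struct into a timestamped JSON file:

package main

import (
	"encoding/json"
	"fmt"
	"os"
	"time"
)

// Status is a hypothetical snapshot of the component state.
type Status struct {
	Timestamp    time.Time `json:"timestamp"`
	HandledItems int       `json:"handledItems"`
	Errors       int       `json:"errors"`
}

// dumpStatus writes a status snapshot to a JSON file named by the data timestamp.
func dumpStatus(status *Status) error {
	data, err := json.MarshalIndent(status, "", "  ")
	if err != nil {
		return err
	}
	name := fmt.Sprintf("status/%s.json", status.Timestamp.Format("20060102-150405"))
	return os.WriteFile(name, data, 0o644)
}

func main() {
	if err := os.MkdirAll("status", 0o755); err != nil {
		panic(err)
	}

	// dump the status once a minute (the interval is an assumption)
	for range time.Tick(time.Minute) {
		status := Status{
			Timestamp:    time.Now(),
			HandledItems: 42, // placeholder value for the sketch
		}
		if err := dumpStatus(&status); err != nil {
			panic(err)
		}
	}
}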

C.4. Load the Status

To load the status, we need to... implement another component - the status loader. This is a backend application which reads the status JSON files for a specified period, analyzes and aggregates the statuses, and returns a response with the relevant graphs, histograms, and texts. This would probably be implemented as an HTTP REST based server.
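
A minimal sketch of such a loader, under the same assumptions (the status fields, the directory layout, and the aggregation logic are hypothetical), is an HTTP handler that reads the status files and returns an aggregated response:

package main

import (
	"encoding/json"
	"net/http"
	"os"
	"path/filepath"
	"time"
)

// Status mirrors the JSON files dumped by the software component.
type Status struct {
	Timestamp    time.Time `json:"timestamp"`
	HandledItems int       `json:"handledItems"`
	Errors       int       `json:"errors"`
}

// aggregate reads all status files and sums them.
// A real loader would also filter by the requested time period.
func aggregate(dir string) (*Status, error) {
	files, err := filepath.Glob(filepath.Join(dir, "*.json"))
	if err != nil {
		return nil, err
	}

	total := Status{}
	for _, file := range files {
		data, err := os.ReadFile(file)
		if err != nil {
			return nil, err
		}
		var status Status
		if err := json.Unmarshal(data, &status); err != nil {
			return nil, err
		}
		total.HandledItems += status.HandledItems
		total.Errors += status.Errors
	}
	return &total, nil
}

func main() {
	http.HandleFunc("/status", func(writer http.ResponseWriter, request *http.Request) {
		total, err := aggregate("status")
		if err != nil {
			http.Error(writer, err.Error(), http.StatusInternalServerError)
			return
		}
		writer.Header().Set("Content-Type", "application/json")
		_ = json.NewEncoder(writer).Encode(total)
	})

	panic(http.ListenAndServe(":9090", nil))
}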

C.5. Visualize the Status

And who sends HTTP requests to the status loader? The status visualization component. This is a JavaScript-based application that presents the responses from the status loader in an interactive and user-friendly manner. We can easily implement such a component, for example using React/Redux. To display graphs and histograms, we can use some of the existing free libraries such as react-vis, google geomap, react-date-range, react-datepicker, react-dropdown, react-select, and many more.


D. Summary

To make our software component a first-class product that can be used and maintained as a long-term working solution, we need to:
  1. Get real data
  2. Implement simulation wrapper to run the real software component code
  3. Export the status from our software component
  4. Implement a status loader 
  5. Implement a status visualization
  6. Test our software component using the simulation and real data
  7. Check results using the status loader and status visualization
  8. Fix issues, and rerun until we have good enough results
  9. Move to production for minimal deployment
  10. Check results using the status loader and status visualization (yes, the same tools should be used for production!)
  11. Fix issues, and rerun until we have good enough results

Monday, January 22, 2024

Using GeoChart from React Google Chart


 

The following is a sample of using Geo-Chart from React Google Chart. Notice that as long as we use data per country, and do not use markers, usage of this library is free and does not require a mapsApiKey.


To use the geo-chart, first install it:

npm install react-google-charts


Next, create a react component to show the data:

import React from 'react'
import {Chart} from 'react-google-charts'

const isoCountries = {
'AF': 'Afghanistan',
'AX': 'Aland Islands',
'AL': 'Albania',
'DZ': 'Algeria',
'AS': 'American Samoa',
'AD': 'Andorra',
'AO': 'Angola',
'AI': 'Anguilla',
'AQ': 'Antarctica',
'AG': 'Antigua And Barbuda',
'AR': 'Argentina',
'AM': 'Armenia',
'AW': 'Aruba',
'AU': 'Australia',
'AT': 'Austria',
'AZ': 'Azerbaijan',
'BS': 'Bahamas',
'BH': 'Bahrain',
'BD': 'Bangladesh',
'BB': 'Barbados',
'BY': 'Belarus',
'BE': 'Belgium',
'BZ': 'Belize',
'BJ': 'Benin',
'BM': 'Bermuda',
'BT': 'Bhutan',
'BO': 'Bolivia',
'BA': 'Bosnia And Herzegovina',
'BW': 'Botswana',
'BV': 'Bouvet Island',
'BR': 'Brazil',
'IO': 'British Indian Ocean Territory',
'BN': 'Brunei Darussalam',
'BG': 'Bulgaria',
'BF': 'Burkina Faso',
'BI': 'Burundi',
'KH': 'Cambodia',
'CM': 'Cameroon',
'CA': 'Canada',
'CV': 'Cape Verde',
'KY': 'Cayman Islands',
'CF': 'Central African Republic',
'TD': 'Chad',
'CL': 'Chile',
'CN': 'China',
'CX': 'Christmas Island',
'CC': 'Cocos (Keeling) Islands',
'CO': 'Colombia',
'KM': 'Comoros',
'CG': 'Congo',
'CD': 'Congo, Democratic Republic',
'CK': 'Cook Islands',
'CR': 'Costa Rica',
'CI': 'Cote D\'Ivoire',
'HR': 'Croatia',
'CU': 'Cuba',
'CY': 'Cyprus',
'CZ': 'Czech Republic',
'DK': 'Denmark',
'DJ': 'Djibouti',
'DM': 'Dominica',
'DO': 'Dominican Republic',
'EC': 'Ecuador',
'EG': 'Egypt',
'SV': 'El Salvador',
'GQ': 'Equatorial Guinea',
'ER': 'Eritrea',
'EE': 'Estonia',
'ET': 'Ethiopia',
'FK': 'Falkland Islands (Malvinas)',
'FO': 'Faroe Islands',
'FJ': 'Fiji',
'FI': 'Finland',
'FR': 'France',
'GF': 'French Guiana',
'PF': 'French Polynesia',
'TF': 'French Southern Territories',
'GA': 'Gabon',
'GM': 'Gambia',
'GE': 'Georgia',
'DE': 'Germany',
'GH': 'Ghana',
'GI': 'Gibraltar',
'GR': 'Greece',
'GL': 'Greenland',
'GD': 'Grenada',
'GP': 'Guadeloupe',
'GU': 'Guam',
'GT': 'Guatemala',
'GG': 'Guernsey',
'GN': 'Guinea',
'GW': 'Guinea-Bissau',
'GY': 'Guyana',
'HT': 'Haiti',
'HM': 'Heard Island & Mcdonald Islands',
'VA': 'Holy See (Vatican City State)',
'HN': 'Honduras',
'HK': 'Hong Kong',
'HU': 'Hungary',
'IS': 'Iceland',
'IN': 'India',
'ID': 'Indonesia',
'IR': 'Iran, Islamic Republic Of',
'IQ': 'Iraq',
'IE': 'Ireland',
'IM': 'Isle Of Man',
'IL': 'Israel',
'IT': 'Italy',
'JM': 'Jamaica',
'JP': 'Japan',
'JE': 'Jersey',
'JO': 'Jordan',
'KZ': 'Kazakhstan',
'KE': 'Kenya',
'KI': 'Kiribati',
'KR': 'Korea',
'KW': 'Kuwait',
'KG': 'Kyrgyzstan',
'LA': 'Lao People\'s Democratic Republic',
'LV': 'Latvia',
'LB': 'Lebanon',
'LS': 'Lesotho',
'LR': 'Liberia',
'LY': 'Libyan Arab Jamahiriya',
'LI': 'Liechtenstein',
'LT': 'Lithuania',
'LU': 'Luxembourg',
'MO': 'Macao',
'MK': 'Macedonia',
'MG': 'Madagascar',
'MW': 'Malawi',
'MY': 'Malaysia',
'MV': 'Maldives',
'ML': 'Mali',
'MT': 'Malta',
'MH': 'Marshall Islands',
'MQ': 'Martinique',
'MR': 'Mauritania',
'MU': 'Mauritius',
'YT': 'Mayotte',
'MX': 'Mexico',
'FM': 'Micronesia, Federated States Of',
'MD': 'Moldova',
'MC': 'Monaco',
'MN': 'Mongolia',
'ME': 'Montenegro',
'MS': 'Montserrat',
'MA': 'Morocco',
'MZ': 'Mozambique',
'MM': 'Myanmar',
'NA': 'Namibia',
'NR': 'Nauru',
'NP': 'Nepal',
'NL': 'Netherlands',
'AN': 'Netherlands Antilles',
'NC': 'New Caledonia',
'NZ': 'New Zealand',
'NI': 'Nicaragua',
'NE': 'Niger',
'NG': 'Nigeria',
'NU': 'Niue',
'NF': 'Norfolk Island',
'MP': 'Northern Mariana Islands',
'NO': 'Norway',
'OM': 'Oman',
'PK': 'Pakistan',
'PW': 'Palau',
'PS': 'Palestinian Territory, Occupied',
'PA': 'Panama',
'PG': 'Papua New Guinea',
'PY': 'Paraguay',
'PE': 'Peru',
'PH': 'Philippines',
'PN': 'Pitcairn',
'PL': 'Poland',
'PT': 'Portugal',
'PR': 'Puerto Rico',
'QA': 'Qatar',
'RE': 'Reunion',
'RO': 'Romania',
'RU': 'Russian Federation',
'RW': 'Rwanda',
'BL': 'Saint Barthelemy',
'SH': 'Saint Helena',
'KN': 'Saint Kitts And Nevis',
'LC': 'Saint Lucia',
'MF': 'Saint Martin',
'PM': 'Saint Pierre And Miquelon',
'VC': 'Saint Vincent And Grenadines',
'WS': 'Samoa',
'SM': 'San Marino',
'ST': 'Sao Tome And Principe',
'SA': 'Saudi Arabia',
'SN': 'Senegal',
'RS': 'Serbia',
'SC': 'Seychelles',
'SL': 'Sierra Leone',
'SG': 'Singapore',
'SK': 'Slovakia',
'SI': 'Slovenia',
'SB': 'Solomon Islands',
'SO': 'Somalia',
'ZA': 'South Africa',
'GS': 'South Georgia And Sandwich Isl.',
'ES': 'Spain',
'LK': 'Sri Lanka',
'SD': 'Sudan',
'SR': 'Suriname',
'SJ': 'Svalbard And Jan Mayen',
'SZ': 'Swaziland',
'SE': 'Sweden',
'CH': 'Switzerland',
'SY': 'Syrian Arab Republic',
'TW': 'Taiwan',
'TJ': 'Tajikistan',
'TZ': 'Tanzania',
'TH': 'Thailand',
'TL': 'Timor-Leste',
'TG': 'Togo',
'TK': 'Tokelau',
'TO': 'Tonga',
'TT': 'Trinidad And Tobago',
'TN': 'Tunisia',
'TR': 'Turkey',
'TM': 'Turkmenistan',
'TC': 'Turks And Caicos Islands',
'TV': 'Tuvalu',
'UG': 'Uganda',
'UA': 'Ukraine',
'AE': 'United Arab Emirates',
'GB': 'United Kingdom',
'US': 'United States',
'UM': 'United States Outlying Islands',
'UY': 'Uruguay',
'UZ': 'Uzbekistan',
'VU': 'Vanuatu',
'VE': 'Venezuela',
'VN': 'Viet Nam',
'VG': 'Virgin Islands, British',
'VI': 'Virgin Islands, U.S.',
'WF': 'Wallis And Futuna',
'EH': 'Western Sahara',
'YE': 'Yemen',
'ZM': 'Zambia',
'ZW': 'Zimbabwe',
}

function GuiGeoMap() {
const population = {
'IL': 12,
'EG': 109,
'US': 300,
}
const area = {
'IL': 0.022,
'EG': 1,
'US': 9,
}

const data = [
['Country', 'Population', 'Area'],
['dummy', 0, 0],
]

for (const country of Object.keys(population)) {
const name = isoCountries[country]
data.push([name, population[country], area[country]])
}


const options = {
colorAxis: {colors: ['#0d8500', '#e31b23']},
backgroundColor: '#81d4fa',
datalessRegionColor: 'white',
defaultColor: '#f5f5f5',
legend: 'none',
}
return (
<Chart
chartType="GeoChart"
width="100%"
height="700px"
data={data}
options={options}
/>
)
}

export default GuiGeoMap


Notice that I've added a "dummy" entry to make sure that even if we have a single country, the color scale starts from zero.

Also, we have two values per country: population and area.

The area does not affect the color; it is only shown as additional data.