FNIHCS

Finding Needles In Haystacks, and Chaotic Systems!

3 December 2020

Emmanuel T Odeke

Orijtech, Inc.

About myself

2

Simple photo upload microservice

3

Uber: microservices graph from July 2020 blog post

4

Netflix: 2014 Bruce Wong presentation

5

Netflix: 2016 Josh Evans presentation

6

Why/What is this talk about?

7

Fundamentals

8

Anatomy of a system

9

n-process system

10

Scenarios

11

Case study: Dgraph 2018

12

Thanksgiving 2018 with Dgraph

13

Later happiness and testimonial

Manish's tweet https://twitter.com/manishrjain/status/1090041366380302336

14

Impediments

15

Observability

16

Tracing & Metrics

17

Tracing aftermath

18

Metrics aftermath

19

Case study: Google-Cloud-Go 2019

20

High and fast bursts of telemetry/events

21

Sampling and isolating outliers/anomalies

22

Central Limit Theorem (CLT)

23

3 sigma rule

24

Code

                // Bucket every span by where its latency falls relative to
                // the mean ± 3 standard deviations (the 3 sigma rule).
                µPlus3σ := st.mean + 3*st.stddev
                µMinus3σ := st.mean - 3*st.stddev
                above3Sigma := make([]*spanDetails, 0, len(sds))
                below3Sigma := make([]*spanDetails, 0, len(sds))
                nonInteresting := make([]*spanDetails, 0, len(sds))
                for _, span := range sds {
                        switch {
                        case span.latencyMs >= µPlus3σ:
                                // Anomalously slow.
                                above3Sigma = append(above3Sigma, span)
                        case span.latencyMs <= µMinus3σ:
                                // Anomalously fast.
                                below3Sigma = append(below3Sigma, span)
                        default:
                                // Within 3σ of the mean.
                                nonInteresting = append(nonInteresting, span)
                        }
                }
25

Case study: Google-Cloud-Go 2020

26

storage: retry logic...continued: 2020

27

Finesse and mastery of your RPC systems and frameworks

28

Case study: Go runtime regression on Darwin: 2019

29

Continuous Profiling

30

CPU Profiling

package main
import "runtime/pprof"

// periodicallyCPUProfile repeatedly captures CPU profiles into w: it profiles
// for profilingPeriod, pauses for profilingPausePeriod, then starts over.
// It runs until ctx is done and then returns ctx.Err().
//
// It panics if a CPU profile cannot be started.
func periodicallyCPUProfile(ctx context.Context, w io.Writer) error {
    for {
        if err := pprof.StartCPUProfile(w); err != nil {
            panic(err)
        }
        select {
        case <-ctx.Done():
            // Stop before returning so the in-flight profile is flushed
            // to w and CPU profiling isn't left enabled.
            pprof.StopCPUProfile()
            return ctx.Err()
        case <-time.After(profilingPeriod):
            pprof.StopCPUProfile()
        }
        // Pause between profiles without holding up cancellation.
        select {
        case <-ctx.Done():
            return ctx.Err()
        case <-time.After(profilingPausePeriod):
        }
    }
}
func main() {
    go periodicallyCPUProfile(ctx, w)
    defer cancel()
    // The rest of the logic goes down below...
}
31

CPU Profiling

package main

import _ "net/http/pprof"

// main starts an HTTP server on :3338 in a background goroutine, using the
// default mux, and logs whatever error ListenAndServe returns.
func main() {
    go func() {
        err := http.ListenAndServe(":3338", nil)
        log.Println(err)
    }()
    // Rest of your logic goes here
}

and then fetch a CPU profile

go tool pprof http://localhost:3338/debug/pprof/profile?seconds=30
32

CPU Profiling result

33

Memory Profiling

package main
import "runtime/pprof"

// periodicallyMemoryProfile repeatedly writes heap profiles to w: it waits
// profilingPeriod, writes a profile, pauses for profilingPausePeriod, then
// starts over. It runs until ctx is done and then returns ctx.Err().
//
// It panics if a heap profile cannot be written.
func periodicallyMemoryProfile(ctx context.Context, w io.Writer) error {
    for {
        select {
        case <-ctx.Done():
            return ctx.Err()
        case <-time.After(profilingPeriod):
            if err := pprof.WriteHeapProfile(w); err != nil {
                panic(err)
            }
        }
        // Pause between profiles without holding up cancellation.
        select {
        case <-ctx.Done():
            return ctx.Err()
        case <-time.After(profilingPausePeriod):
        }
    }
}
func main() {
    go periodicallyMemoryProfile(ctx, w)
    defer cancel()
    // The rest of the logic goes down below...
}
34

Memory Profiling

package main

import _ "net/http/pprof"

// main starts an HTTP server on :3338 in a background goroutine, using the
// default mux, and logs whatever error ListenAndServe returns.
func main() {
    go func() {
        err := http.ListenAndServe(":3338", nil)
        log.Println(err)
    }()
    // Rest of your logic goes here
}

and then fetch a heap (memory) profile

go tool pprof http://localhost:3338/debug/pprof/heap?seconds=30
35

Memory Profiling result

36

Advisory

37

Toolbox

38

References

39

References

40

Thank you

Emmanuel T Odeke

Orijtech, Inc.

Observability, and infrastructure for high performance systems, and the cloud!