prysm-pulse/endtoend/evaluators/metrics.go

package evaluators

import (
	"context"
	"fmt"
	"io/ioutil"
	"net/http"
	"strconv"
	"strings"
	"time"

	ptypes "github.com/gogo/protobuf/types"
	"github.com/pkg/errors"
	eth "github.com/prysmaticlabs/ethereumapis/eth/v1alpha1"
	e2e "github.com/prysmaticlabs/prysm/endtoend/params"
	"github.com/prysmaticlabs/prysm/endtoend/types"
	"github.com/prysmaticlabs/prysm/shared/p2putils"
	"google.golang.org/grpc"
)

// MetricsCheck is the end-to-end evaluator that inspects each beacon node's
// Prometheus metrics page (see metricsTest) to confirm caches are working and
// overall node health looks good. The first epoch is intentionally not checked
// so that the sample size isn't too small.
// NOTE(review): the exact epochs selected depend on afterNthEpoch(0), which is
// defined elsewhere in this package — confirm it skips epoch 0.
var MetricsCheck = types.Evaluator{
	Evaluation: metricsTest,
	Policy:     afterNthEpoch(0),
	Name:       "metrics_check_epoch_%d",
}

// equalityTest describes a single-metric check: the metric identified by topic
// is compared against value. Despite the type name, the only table built from
// it in this file (metricLessThanTests) is evaluated as "metric < value".
type equalityTest struct {
	// name is the human readable label used when reporting a failure.
	// topic is the metric name exactly as it appears on the metrics page.
	name, topic string
	// value is the bound the metric is checked against.
	value int
}

// comparisonTest describes a check on the ratio of two metrics: the value of
// topic1 divided by the value of topic2 is compared against expectedComparison
// (see metricCheckComparison for the exact rule).
type comparisonTest struct {
	// name is a human readable label for the check.
	// topic1 is the numerator metric and topic2 the denominator metric, both
	// written exactly as they appear on the metrics page. A "%x" placeholder,
	// when present, is filled with the fork digest by metricsTest.
	name, topic1, topic2 string
	// expectedComparison is the bound applied to topic1/topic2.
	expectedComparison float64
}

// metricLessThanTests lists the metrics whose value must stay strictly below
// the given limit on every node; metricsTest runs each entry through
// metricCheckLessThan.
var metricLessThanTests = []equalityTest{
	// Allocated heap bytes must stay under 100 MB (decimal).
	{name: "memory usage", topic: "go_memstats_alloc_bytes", value: 100 * 1000 * 1000},
}

// metricComparisonTests lists the metric pairs whose ratio topic1/topic2 must
// stay strictly below expectedComparison; metricsTest runs each entry through
// metricCheckComparison. The "%x" placeholder in the p2p topics is replaced
// with the fork digest before the lookup.
var metricComparisonTests = []comparisonTest{
	// Gossip validation: failed messages relative to received messages.
	{
		name:               "beacon aggregate and proof",
		topic1:             `p2p_message_failed_validation_total{topic="/eth2/%x/beacon_aggregate_and_proof/ssz_snappy"}`,
		topic2:             `p2p_message_received_total{topic="/eth2/%x/beacon_aggregate_and_proof/ssz_snappy"}`,
		expectedComparison: 0.8,
	},
	{
		name:               "committee index 0 beacon attestation",
		topic1:             `p2p_message_failed_validation_total{topic="/eth2/%x/committee_index0_beacon_attestation/ssz_snappy"}`,
		topic2:             `p2p_message_received_total{topic="/eth2/%x/committee_index0_beacon_attestation/ssz_snappy"}`,
		expectedComparison: 0.15,
	},
	{
		name:               "committee index 1 beacon attestation",
		topic1:             `p2p_message_failed_validation_total{topic="/eth2/%x/committee_index1_beacon_attestation/ssz_snappy"}`,
		topic2:             `p2p_message_received_total{topic="/eth2/%x/committee_index1_beacon_attestation/ssz_snappy"}`,
		expectedComparison: 0.15,
	},
	// Cache effectiveness: misses relative to hits.
	{
		name:               "committee cache",
		topic1:             "committee_cache_miss",
		topic2:             "committee_cache_hit",
		expectedComparison: 0.01,
	},
	{
		name:               "hot state cache",
		topic1:             "hot_state_cache_miss",
		topic2:             "hot_state_cache_hit",
		expectedComparison: 0.01,
	},
}

// metricsTest validates the Prometheus metrics page of every beacon node.
//
// For the i-th connection it fetches
// http://localhost:<BeaconNodeMetricsPort+i>/metrics, then:
//   - checks that the beacon_clock_time_slot metric equals the head slot
//     reported by the node's GetChainHead RPC,
//   - runs every entry of metricLessThanTests, and
//   - runs every entry of metricComparisonTests.
//
// Topics containing "%x" are formatted with the fork digest derived from the
// node's genesis time and genesis validators root. The first failing check is
// returned as an error; nil means all nodes passed.
func metricsTest(conns ...*grpc.ClientConn) error {
	for i, conn := range conns {
		response, err := http.Get(fmt.Sprintf("http://localhost:%d/metrics", e2e.TestParams.BeaconNodeMetricsPort+i))
		if err != nil {
			return errors.Wrap(err, "failed to reach prometheus metrics page")
		}
		dataInBytes, readErr := ioutil.ReadAll(response.Body)
		// Close before looking at readErr so the body is released on every
		// path, including a failed read.
		closeErr := response.Body.Close()
		if readErr != nil {
			return errors.Wrap(readErr, "failed to read prometheus metrics page")
		}
		if closeErr != nil {
			return closeErr
		}
		// Anything other than 200 is not a metrics page; fail here instead of
		// with a confusing "did not find requested text" error further down.
		if response.StatusCode != http.StatusOK {
			return fmt.Errorf("unexpected status code %d from prometheus metrics page", response.StatusCode)
		}
		pageContent := string(dataInBytes)
		time.Sleep(connTimeDelay)

		genesis, err := eth.NewNodeClient(conn).GetGenesis(context.Background(), &ptypes.Empty{})
		if err != nil {
			return err
		}
		forkDigest, err := p2putils.CreateForkDigest(time.Unix(genesis.GenesisTime.Seconds, 0), genesis.GenesisValidatorsRoot)
		if err != nil {
			return err
		}
		// withForkDigest fills the "%x" placeholder used by the p2p topics and
		// leaves every other topic untouched.
		withForkDigest := func(topic string) string {
			if strings.Contains(topic, "%x") {
				return fmt.Sprintf(topic, forkDigest)
			}
			return topic
		}

		chainHead, err := eth.NewBeaconChainClient(conn).GetChainHead(context.Background(), &ptypes.Empty{})
		if err != nil {
			return err
		}
		timeSlot, err := getValueOfTopic(pageContent, "beacon_clock_time_slot")
		if err != nil {
			return err
		}
		if chainHead.HeadSlot != uint64(timeSlot) {
			return fmt.Errorf("expected metrics slot to equal chain head slot, expected %d, received %d", chainHead.HeadSlot, timeSlot)
		}

		for _, test := range metricLessThanTests {
			if err := metricCheckLessThan(pageContent, withForkDigest(test.topic), test.value); err != nil {
				return errors.Wrapf(err, "failed %s check", test.name)
			}
		}
		for _, test := range metricComparisonTests {
			topic1 := withForkDigest(test.topic1)
			topic2 := withForkDigest(test.topic2)
			if err := metricCheckComparison(pageContent, topic1, topic2, test.expectedComparison); err != nil {
				// Name the failing check, matching the less-than loop above.
				return errors.Wrapf(err, "failed %s check", test.name)
			}
		}
	}
	return nil
}

// metricCheckLessThan reads the metric named topic from pageContent and
// returns nil when its value is strictly less than value. A lookup failure
// from getValueOfTopic is returned unchanged; a value that is greater than or
// equal to the limit produces a descriptive error.
func metricCheckLessThan(pageContent string, topic string, value int) error {
	observed, err := getValueOfTopic(pageContent, topic)
	switch {
	case err != nil:
		return err
	case observed < value:
		return nil
	}
	return fmt.Errorf(
		"unexpected result for metric %s, expected less than %d, received %d",
		topic, value, observed,
	)
}

// metricCheckComparison verifies that the ratio topic1/topic2, read from the
// metrics page in pageContent, is strictly below comparison.
//
// topic2 must be present and parseable, otherwise its lookup error is
// returned. topic1 may be absent from the page, in which case the check
// passes. If topic1 is present but cannot be read, that error is returned
// rather than being treated as a pass.
//
// When topic2 is zero the ratio is undefined: the check passes if topic1 is
// also zero (nothing was observed) and fails otherwise.
func metricCheckComparison(pageContent string, topic1 string, topic2 string, comparison float64) error {
	topic2Value, err := getValueOfTopic(pageContent, topic2)
	if err != nil {
		return err
	}
	// If we can't find the first topic (error metrics), then assume the test passes.
	// Presence is decided here, anchored to the start of a line, instead of
	// from getValueOfTopic's -1 return value: that value is also produced for
	// parse failures, which must not be silently accepted.
	sample := topic1 + " "
	if !strings.HasPrefix(pageContent, sample) && !strings.Contains(pageContent, "\n"+sample) {
		return nil
	}
	topic1Value, err := getValueOfTopic(pageContent, topic1)
	if err != nil {
		return err
	}
	if topic2Value == 0 {
		// Avoid a float division by zero (NaN or +Inf) and report the
		// situation explicitly.
		if topic1Value == 0 {
			return nil
		}
		return fmt.Errorf(
			"unexpected result for comparison between metric %s and metric %s, received %d for the first metric while the second metric is 0",
			topic1,
			topic2,
			topic1Value,
		)
	}
	topicComparison := float64(topic1Value) / float64(topic2Value)
	if topicComparison >= comparison {
		return fmt.Errorf(
			"unexpected result for comparison between metric %s and metric %s, expected comparison to be %.2f, received %.2f",
			topic1,
			topic2,
			comparison,
			topicComparison,
		)
	}
	return nil
}

// getValueOfTopic extracts the value of the metric named topic from a
// Prometheus text-format page.
//
// A sample line is only matched when it starts with topic followed by a
// space, so comment lines ("# HELP ...", "# TYPE ...") and metrics whose name
// merely ends with topic are ignored. If several lines match, the last one
// wins. The value is the first whitespace-separated field after the name, so
// an optional trailing timestamp or "\r" is tolerated, as is a final line
// without a trailing newline.
//
// The value is parsed as a float and truncated to int. On any failure the
// function returns -1 together with a non-nil error.
func getValueOfTopic(pageContent string, topic string) (int, error) {
	// Adding a space to search exactly.
	sample := topic + " "
	var startOfValue int
	if idx := strings.LastIndex(pageContent, "\n"+sample); idx != -1 {
		// Skip the newline, the topic name and the space after it.
		startOfValue = idx + 1 + len(sample)
	} else if strings.HasPrefix(pageContent, sample) {
		// The sample sits on the very first line of the page.
		startOfValue = len(sample)
	} else {
		return -1, fmt.Errorf("did not find requested text %s in %s", topic, pageContent)
	}
	restOfLine := pageContent[startOfValue:]
	if endOfLine := strings.Index(restOfLine, "\n"); endOfLine != -1 {
		restOfLine = restOfLine[:endOfLine]
	}
	fields := strings.Fields(restOfLine)
	if len(fields) == 0 {
		return -1, fmt.Errorf("could not find a value for %s", topic)
	}
	metricValue := fields[0]
	floatResult, err := strconv.ParseFloat(metricValue, 64)
	if err != nil {
		return -1, fmt.Errorf("could not parse %s for int: %w", metricValue, err)
	}
	return int(floatResult), nil
}
Add prometheus metrics test into E2E (#5673) * Progress on metrics tests * Progress on metrics test * Get metrics E2E working * Merge branch 'master' of https://github.com/prysmaticlabs/prysm into e2e-metrics * Complete most of metrics tests * Change E2E polling to the middle of a slot, instead of at the start of the middle * Add metrics to all E2E * Remove extra types * Update endtoend/evaluators/metrics.go Co-Authored-By: Preston Van Loon <preston@prysmaticlabs.com> * Merge branch 'master' into e2e-metrics * Add more comments, address feedback * Merge branch 'e2e-metrics' of https://github.com/prysmaticlabs/prysm into e2e-metrics * Fix build * Remove unneeded comment * Set E2E_EPOCHS back * Improve sync testing reliability * Remove metrics check from slashing * Improve time allotted to sync * Remove possibly flaky sync test change 2020-04-29 13:42:12 +00:00			`package evaluators`

			`import (`
			`"context"`
			`"fmt"`
			`"io/ioutil"`
			`"net/http"`
			`"strconv"`
			`"strings"`
			`"time"`

			`ptypes "github.com/gogo/protobuf/types"`
			`"github.com/pkg/errors"`
			`eth "github.com/prysmaticlabs/ethereumapis/eth/v1alpha1"`
			`e2e "github.com/prysmaticlabs/prysm/endtoend/params"`
			`"github.com/prysmaticlabs/prysm/endtoend/types"`
			`"github.com/prysmaticlabs/prysm/shared/p2putils"`
			`"google.golang.org/grpc"`
			`)`

			`// MetricsCheck performs a check on metrics to make sure caches are functioning, and`
			`// overall health is good. Not checking the first epoch so the sample size isn't too small.`
			`var MetricsCheck = types.Evaluator{`
			`Name: "metrics_check_epoch_%d",`
			`Policy: afterNthEpoch(0),`
			`Evaluation: metricsTest,`
			`}`

			`type equalityTest struct {`
			`name string`
			`topic string`
			`value int`
			`}`

			`type comparisonTest struct {`
			`name string`
			`topic1 string`
			`topic2 string`
			`expectedComparison float64`
			`}`

			`var metricLessThanTests = []equalityTest{`
			`{`
			`name: "memory usage",`
			`topic: "go_memstats_alloc_bytes",`
			`value: 100000000, // 100 Mb`
			`},`
			`}`

			`var metricComparisonTests = []comparisonTest{`
			`{`
			`name: "beacon aggregate and proof",`
			`topic1: "p2p_message_failed_validation_total{topic=\"/eth2/%x/beacon_aggregate_and_proof/ssz_snappy\"}",`
			`topic2: "p2p_message_received_total{topic=\"/eth2/%x/beacon_aggregate_and_proof/ssz_snappy\"}",`
			`expectedComparison: 0.8,`
			`},`
			`{`
			`name: "committee index 0 beacon attestation",`
			`topic1: "p2p_message_failed_validation_total{topic=\"/eth2/%x/committee_index0_beacon_attestation/ssz_snappy\"}",`
			`topic2: "p2p_message_received_total{topic=\"/eth2/%x/committee_index0_beacon_attestation/ssz_snappy\"}",`
Fix test setup and change participation min back to 100% (#5791) * Increase time for test to start * Try anothre fix * Set minimum participation back to 100% * Change bnode count to 2 * Give extra wiggle room for metrics * Remove comment 2020-05-09 20:11:10 +00:00			`expectedComparison: 0.15,`
Add prometheus metrics test into E2E (#5673) * Progress on metrics tests * Progress on metrics test * Get metrics E2E working * Merge branch 'master' of https://github.com/prysmaticlabs/prysm into e2e-metrics * Complete most of metrics tests * Change E2E polling to the middle of a slot, instead of at the start of the middle * Add metrics to all E2E * Remove extra types * Update endtoend/evaluators/metrics.go Co-Authored-By: Preston Van Loon <preston@prysmaticlabs.com> * Merge branch 'master' into e2e-metrics * Add more comments, address feedback * Merge branch 'e2e-metrics' of https://github.com/prysmaticlabs/prysm into e2e-metrics * Fix build * Remove unneeded comment * Set E2E_EPOCHS back * Improve sync testing reliability * Remove metrics check from slashing * Improve time allotted to sync * Remove possibly flaky sync test change 2020-04-29 13:42:12 +00:00			`},`
			`{`
			`name: "committee index 1 beacon attestation",`
			`topic1: "p2p_message_failed_validation_total{topic=\"/eth2/%x/committee_index1_beacon_attestation/ssz_snappy\"}",`
			`topic2: "p2p_message_received_total{topic=\"/eth2/%x/committee_index1_beacon_attestation/ssz_snappy\"}",`
Fix test setup and change participation min back to 100% (#5791) * Increase time for test to start * Try anothre fix * Set minimum participation back to 100% * Change bnode count to 2 * Give extra wiggle room for metrics * Remove comment 2020-05-09 20:11:10 +00:00			`expectedComparison: 0.15,`
Add prometheus metrics test into E2E (#5673) * Progress on metrics tests * Progress on metrics test * Get metrics E2E working * Merge branch 'master' of https://github.com/prysmaticlabs/prysm into e2e-metrics * Complete most of metrics tests * Change E2E polling to the middle of a slot, instead of at the start of the middle * Add metrics to all E2E * Remove extra types * Update endtoend/evaluators/metrics.go Co-Authored-By: Preston Van Loon <preston@prysmaticlabs.com> * Merge branch 'master' into e2e-metrics * Add more comments, address feedback * Merge branch 'e2e-metrics' of https://github.com/prysmaticlabs/prysm into e2e-metrics * Fix build * Remove unneeded comment * Set E2E_EPOCHS back * Improve sync testing reliability * Remove metrics check from slashing * Improve time allotted to sync * Remove possibly flaky sync test change 2020-04-29 13:42:12 +00:00			`},`
			`{`
			`name: "committee cache",`
			`topic1: "committee_cache_miss",`
			`topic2: "committee_cache_hit",`
			`expectedComparison: 0.01,`
			`},`
			`{`
			`name: "hot state cache",`
			`topic1: "hot_state_cache_miss",`
			`topic2: "hot_state_cache_hit",`
			`expectedComparison: 0.01,`
			`},`
			`}`

			`func metricsTest(conns ...*grpc.ClientConn) error {`
			`for i := 0; i < len(conns); i++ {`
			`response, err := http.Get(fmt.Sprintf("http://localhost:%d/metrics", e2e.TestParams.BeaconNodeMetricsPort+i))`
			`if err != nil {`
			`return errors.Wrap(err, "failed to reach prometheus metrics page")`
			`}`
			`dataInBytes, err := ioutil.ReadAll(response.Body)`
			`if err != nil {`
			`return err`
			`}`
			`pageContent := string(dataInBytes)`
			`if err := response.Body.Close(); err != nil {`
			`return err`
			`}`
E2E Improvements (#6091) * Some fixes * Merge branch 'master' into e2e-fixes * Add another small delay * Merge branch 'e2e-fixes' of github.com:prysmaticlabs/prysm into e2e-fixes * Remove genesis test, make normal e2e run longer * Gaz * more fixes * Merge branch 'master' into e2e-fixes * Merge refs/heads/master into e2e-fixes * Fix comment * Merge refs/heads/master into e2e-fixes 2020-06-03 18:44:13 +00:00			`time.Sleep(connTimeDelay)`
Add prometheus metrics test into E2E (#5673) * Progress on metrics tests * Progress on metrics test * Get metrics E2E working * Merge branch 'master' of https://github.com/prysmaticlabs/prysm into e2e-metrics * Complete most of metrics tests * Change E2E polling to the middle of a slot, instead of at the start of the middle * Add metrics to all E2E * Remove extra types * Update endtoend/evaluators/metrics.go Co-Authored-By: Preston Van Loon <preston@prysmaticlabs.com> * Merge branch 'master' into e2e-metrics * Add more comments, address feedback * Merge branch 'e2e-metrics' of https://github.com/prysmaticlabs/prysm into e2e-metrics * Fix build * Remove unneeded comment * Set E2E_EPOCHS back * Improve sync testing reliability * Remove metrics check from slashing * Improve time allotted to sync * Remove possibly flaky sync test change 2020-04-29 13:42:12 +00:00
			`genesis, err := eth.NewNodeClient(conns[i]).GetGenesis(context.Background(), &ptypes.Empty{})`
			`if err != nil {`
			`return err`
			`}`
			`forkDigest, err := p2putils.CreateForkDigest(time.Unix(genesis.GenesisTime.Seconds, 0), genesis.GenesisValidatorsRoot)`
			`if err != nil {`
			`return err`
			`}`

			`chainHead, err := eth.NewBeaconChainClient(conns[i]).GetChainHead(context.Background(), &ptypes.Empty{})`
			`if err != nil {`
			`return err`
			`}`
			`timeSlot, err := getValueOfTopic(pageContent, "beacon_clock_time_slot")`
			`if err != nil {`
			`return err`
			`}`
			`if chainHead.HeadSlot != uint64(timeSlot) {`
			`return fmt.Errorf("expected metrics slot to equal chain head slot, expected %d, received %d", chainHead.HeadSlot, timeSlot)`
			`}`

			`for _, test := range metricLessThanTests {`
			`topic := test.topic`
			`if strings.Contains(topic, "%x") {`
			`topic = fmt.Sprintf(topic, forkDigest)`
			`}`
			`if err := metricCheckLessThan(pageContent, topic, test.value); err != nil {`
			`return errors.Wrapf(err, "failed %s check", test.name)`
			`}`
			`}`
			`for _, test := range metricComparisonTests {`
			`topic1 := test.topic1`
			`if strings.Contains(topic1, "%x") {`
			`topic1 = fmt.Sprintf(topic1, forkDigest)`
			`}`
			`topic2 := test.topic2`
			`if strings.Contains(topic2, "%x") {`
			`topic2 = fmt.Sprintf(topic2, forkDigest)`
			`}`
			`if err := metricCheckComparison(pageContent, topic1, topic2, test.expectedComparison); err != nil {`
			`return err`
			`}`
			`}`
			`}`
			`return nil`
			`}`

			`func metricCheckLessThan(pageContent string, topic string, value int) error {`
			`topicValue, err := getValueOfTopic(pageContent, topic)`
			`if err != nil {`
			`return err`
			`}`
			`if topicValue >= value {`
			`return fmt.Errorf(`
			`"unexpected result for metric %s, expected less than %d, received %d",`
			`topic,`
			`value,`
			`topicValue,`
			`)`
			`}`
			`return nil`
			`}`

			`func metricCheckComparison(pageContent string, topic1 string, topic2 string, comparison float64) error {`
			`topic2Value, err := getValueOfTopic(pageContent, topic2)`
			`if err != nil {`
			`return err`
			`}`
			`topic1Value, err := getValueOfTopic(pageContent, topic1)`
			`// If we can't find the first topic (error metrics), then assume the test passes.`
			`if topic1Value == -1 && topic2Value != -1 {`
			`return nil`
			`}`
			`if err != nil {`
			`return err`
			`}`
			`topicComparison := float64(topic1Value) / float64(topic2Value)`
			`if topicComparison >= comparison {`
			`return fmt.Errorf(`
			`"unexpected result for comparison between metric %s and metric %s, expected comparison to be %.2f, received %.2f",`
			`topic1,`
			`topic2,`
			`comparison,`
			`topicComparison,`
			`)`
			`}`
			`return nil`
			`}`

			`func getValueOfTopic(pageContent string, topic string) (int, error) {`
			`// Adding a space to search exactly.`
			`startIdx := strings.LastIndex(pageContent, topic+" ")`
			`if startIdx == -1 {`
			`return -1, fmt.Errorf("did not find requested text %s in %s", topic, pageContent)`
			`}`
			`endOfTopic := startIdx + len(topic)`
			`// Adding 1 to skip the space after the topic name.`
			`startOfValue := endOfTopic + 1`
			`endOfValue := strings.Index(pageContent[startOfValue:], "\n")`
			`if endOfValue == -1 {`
			`return -1, fmt.Errorf("could not find next space in %s", pageContent[startOfValue:])`
			`}`
			`metricValue := pageContent[startOfValue : startOfValue+endOfValue]`
			`floatResult, err := strconv.ParseFloat(metricValue, 64)`
			`if err != nil {`
			`return -1, errors.Wrapf(err, "could not parse %s for int", metricValue)`
			`}`
			`return int(floatResult), nil`
			`}`