erigon-pulse/cmd/snapshots/sync/sync.go
Mark Holt 79ed8cad35
E2 snapshot uploading (#9056)
This change introduces additional processes to manage snapshot uploading
for E2 snapshots:

## erigon snapshots upload

The `snapshots uploader` command starts a version of erigon customized
for uploading snapshot files to
a remote location.  

It breaks the stage execution process after the senders stage and then
uses the snapshot stage to send
uploaded headers, bodies and (in the case of polygon) bor spans and
events to snapshot files. Because
this process avoids execution, it runs significantly faster than a
standard erigon configuration.

The uploader uses rclone to send seedable snapshots (100K or 500K blocks) to a
remote storage location specified
in the rclone config file.

The **uploader** is configured to minimize disk usage by doing the
following:

* It removes snapshots once they are loaded
* It aggressively prunes the database once entities are transferred to
snapshots

In addition to this, it has the following performance-related features:

* maximizes the workers allocated to snapshot processing to improve
throughput
* Can be started from scratch by downloading the latest snapshots from
the remote location to seed processing

## snapshots command

Is a standalone command for managing remote snapshots. It has the
following subcommands:

* **cmp** - compare snapshots
* **copy** - copy snapshots
* **verify** - verify snapshots
* **manifest** - manage the manifest file in the root of remote snapshot
locations
* **torrent** - manage snapshot torrent files
2023-12-27 22:05:09 +00:00

445 lines
8.9 KiB
Go

package sync
import (
"bufio"
"context"
"fmt"
"io/fs"
"os"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"time"
"github.com/anacrolix/torrent"
"github.com/anacrolix/torrent/metainfo"
"github.com/anacrolix/torrent/storage"
"github.com/c2h5oh/datasize"
"github.com/ledgerwatch/erigon-lib/chain/snapcfg"
"github.com/ledgerwatch/erigon-lib/common"
"github.com/ledgerwatch/erigon-lib/common/datadir"
"github.com/ledgerwatch/erigon-lib/downloader"
"github.com/ledgerwatch/erigon-lib/downloader/downloadercfg"
"github.com/ledgerwatch/erigon-lib/downloader/snaptype"
"github.com/ledgerwatch/erigon/cmd/downloader/downloadernat"
"github.com/ledgerwatch/erigon/cmd/utils"
"github.com/ledgerwatch/erigon/p2p/nat"
"github.com/ledgerwatch/erigon/params"
"github.com/urfave/cli/v2"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
)
// LType identifies the kind of location a Locator points at.
type LType int

const (
	// TorrentFs marks a location served through torrents.
	TorrentFs LType = iota
	// LocalFs marks a directory on the local file system.
	LocalFs
	// RemoteFs marks a root on a named remote source.
	RemoteFs
)

// Locator describes where a set of snapshots lives.
type Locator struct {
	// LType selects which of the remaining fields are meaningful.
	LType LType
	// Src is the name of the remote source (used for RemoteFs).
	Src string
	// Root is the local path or the root within the remote source.
	Root string
	// Version is the snapshot version; zero means no version was given.
	Version uint8
	// Chain is the chain name associated with the location, if known.
	Chain string
}

// String renders the locator in its textual form: "torrent" for TorrentFs,
// the root for LocalFs and "<src>:<root>" for RemoteFs, followed by
// ":v<version>" when a non-zero version is set. An unrecognised LType
// contributes an empty base.
func (l Locator) String() string {
	suffix := ""
	if l.Version > 0 {
		suffix = fmt.Sprint(":v", l.Version)
	}

	switch l.LType {
	case TorrentFs:
		return "torrent" + suffix
	case LocalFs:
		return l.Root + suffix
	case RemoteFs:
		return l.Src + ":" + l.Root + suffix
	default:
		return suffix
	}
}
// locatorExp splits a locator string into an optional "<src>:" prefix
// (group 1), a root that contains no colon (group 2) and an optional
// ":v<digits>" version suffix (group 3). Every part is optional, so the
// expression matches any input.
var locatorExp = regexp.MustCompile(`^(?:(\w+)\:)?([^\:]*)(?:\:(v\d+))?`)

// srcExp captures the chain name from a remote root of the form
// "erigon-v<digits>-snapshots-<chain>".
var srcExp = regexp.MustCompile(`^erigon-v\d+-snapshots-(.*)$`)

// parseVersionTag converts a version tag such as "v1" into its numeric value.
// The first character is skipped without being inspected and the remainder
// must be a base-10 number that fits in 8 bits. tag must not be empty.
func parseVersionTag(tag string) (uint8, error) {
	version, err := strconv.ParseUint(tag[1:], 10, 8)
	if err != nil {
		return 0, fmt.Errorf("can't parse version: %s: %w", tag, err)
	}

	return uint8(version), nil
}

// ParseLocator parses the textual form of a snapshot location.
//
// Recognised shapes:
//   - "torrent:" or "torrent:v<N>" yields a TorrentFs locator;
//   - "<src>:<root>" or "<src>:<root>:v<N>" yields a RemoteFs locator, with
//     Chain filled in when the root looks like "erigon-v<N>-snapshots-<chain>";
//   - anything without a "<src>:" prefix yields a LocalFs locator whose root
//     is passed through downloader.Clean.
//
// It returns an error when a version tag is present but cannot be parsed.
func ParseLocator(value string) (*Locator, error) {
	if matches := locatorExp.FindStringSubmatch(value); len(matches) > 0 {
		var loc Locator

		switch {
		case matches[1] == "torrent":
			loc.LType = TorrentFs

			// For "torrent:v1" the root group (2) consumes the version tag, so
			// the version is read from there. The error must quote that same
			// text, not the always-empty group 3.
			if tag := matches[2]; len(tag) > 0 {
				version, err := parseVersionTag(tag)
				if err != nil {
					return nil, err
				}

				loc.Version = version
			}

		case len(matches[1]) > 0:
			loc.LType = RemoteFs
			loc.Src = matches[1]
			loc.Root = matches[2]

			if chain := srcExp.FindStringSubmatch(loc.Root); len(chain) > 1 {
				loc.Chain = chain[1]
			}

			if tag := matches[3]; len(tag) > 0 {
				version, err := parseVersionTag(tag)
				if err != nil {
					return nil, err
				}

				loc.Version = version
			}

		default:
			loc.LType = LocalFs
			loc.Root = downloader.Clean(matches[2])
		}

		return &loc, nil
	}

	// locatorExp matches every string, so this is only a defensive fallback
	// that treats the value as a local path.
	if path, err := filepath.Abs(value); err == nil {
		return &Locator{
			LType: LocalFs,
			Root:  path,
		}, nil
	}

	return nil, fmt.Errorf("Invalid locator syntax")
}
// TorrentClient wraps a torrent.Client (embedded, so its methods are promoted)
// together with the client configuration stored alongside it.
type TorrentClient struct {
*torrent.Client
// cfg is the torrent client configuration kept with the client.
cfg *torrent.ClientConfig
}
// NewTorrentClient builds a torrent client for chain from the command line
// context.
//
// The client stores its data under "<temp dir>/torrents/<chain>"; that
// directory is removed and recreated on every call. Web seeds come from the
// web seeds flag plus any known web seeds registered for the chain, and the
// rate limits, verbosity, port, connections-per-file and IPv4/IPv6 switches
// are read from their respective flags.
//
// It returns an error if a flag value cannot be parsed, the downloader
// configuration cannot be built, the torrent directory cannot be reset or the
// underlying torrent client cannot be created.
func NewTorrentClient(cliCtx *cli.Context, chain string) (*TorrentClient, error) {
	logger := Logger(cliCtx.Context)
	torrentDir := filepath.Join(TempDir(cliCtx.Context), "torrents", chain)
	dirs := datadir.New(torrentDir)

	webseeds := common.CliString2Array(cliCtx.String(utils.WebSeedsFlag.Name))
	if chainSeeds, found := snapcfg.KnownWebseeds[chain]; found {
		webseeds = append(webseeds, chainSeeds...)
	}

	// Both rate limits are parsed the same way from their flag's text value.
	var downloadRate, uploadRate datasize.ByteSize
	rateFlags := []struct {
		dst  *datasize.ByteSize
		flag string
	}{
		{&downloadRate, utils.TorrentDownloadRateFlag.Name},
		{&uploadRate, utils.TorrentUploadRateFlag.Name},
	}
	for _, rate := range rateFlags {
		if err := rate.dst.UnmarshalText([]byte(cliCtx.String(rate.flag))); err != nil {
			return nil, err
		}
	}

	logLevel, _, err := downloadercfg.Int2LogLevel(cliCtx.Int(utils.TorrentVerbosityFlag.Name))
	if err != nil {
		return nil, err
	}

	version := "erigon: " + params.VersionWithCommit(params.GitCommit)
	port := cliCtx.Int(utils.TorrentPortFlag.Name)
	connsPerFile := cliCtx.Int(utils.TorrentConnsPerFileFlag.Name)

	cfg, err := downloadercfg.New(dirs, version, logLevel, downloadRate, uploadRate,
		port, connsPerFile, 0, nil, webseeds, chain)
	if err != nil {
		return nil, err
	}

	// Start from an empty torrent directory. This runs after the downloader
	// configuration has been built, exactly as before.
	if err := os.RemoveAll(torrentDir); err != nil {
		return nil, fmt.Errorf("can't clean torrent dir: %w", err)
	}
	if err := os.MkdirAll(torrentDir, 0755); err != nil {
		return nil, err
	}

	clientCfg := cfg.ClientConfig
	clientCfg.DataDir = torrentDir
	clientCfg.PieceHashersPerTorrent = 32 * runtime.NumCPU()
	clientCfg.DisableIPv6 = cliCtx.Bool(utils.DisableIPV6.Name)
	clientCfg.DisableIPv4 = cliCtx.Bool(utils.DisableIPV4.Name)

	// NOTE(review): the NAT setting is taken from the flag definition's Value
	// field rather than from cliCtx - confirm this is intended.
	natif, err := nat.Parse(utils.NATFlag.Value)
	if err != nil {
		return nil, fmt.Errorf("invalid nat option %s: %w", utils.NATFlag.Value, err)
	}
	downloadernat.DoNat(natif, clientCfg, logger)

	clientCfg.DefaultStorage = storage.NewMMap(torrentDir)

	client, err := torrent.NewClient(clientCfg)
	if err != nil {
		return nil, fmt.Errorf("can't create torrent client: %w", err)
	}

	return &TorrentClient{Client: client, cfg: clientCfg}, nil
}
// torrentSession is a download session backed by a torrent client.
type torrentSession struct {
// cli is the torrent client used to fetch files.
cli *TorrentClient
// items holds the preverified snapshot items known to the session; the
// session's methods look entries up by file name.
items map[string]snapcfg.PreverifiedItem
}
// fileInfo presents a preverified snapshot item through file-info style
// accessors. Only the name carries real data; the other accessors return
// fixed placeholder values.
type fileInfo struct {
// info is the preverified item being described.
info snapcfg.PreverifiedItem
}
// Name returns the name of the underlying preverified item.
func (fi *fileInfo) Name() string {
return fi.info.Name
}
// Size always returns 0; no size is recorded for the item.
func (fi *fileInfo) Size() int64 {
return 0
}
// Mode always returns fs.ModeIrregular.
func (fi *fileInfo) Mode() fs.FileMode {
return fs.ModeIrregular
}
// ModTime always returns the zero time.
func (fi *fileInfo) ModTime() time.Time {
return time.Time{}
}
// IsDir always returns false.
func (fi *fileInfo) IsDir() bool {
return false
}
// torrentInfo pairs a torrent hash with the snapshot details parsed from a
// file name. snapInfo may be nil, in which case the snapshot accessors return
// zero values.
type torrentInfo struct {
	snapInfo *snaptype.FileInfo
	hash     string
}

// Version returns the snapshot version, or 0 when no snapshot info is set.
func (i *torrentInfo) Version() uint8 {
	if i.snapInfo == nil {
		return 0
	}
	return i.snapInfo.Version
}

// From returns the snapshot's From value, or 0 when no snapshot info is set.
func (i *torrentInfo) From() uint64 {
	if i.snapInfo == nil {
		return 0
	}
	return i.snapInfo.From
}

// To returns the snapshot's To value, or 0 when no snapshot info is set.
func (i *torrentInfo) To() uint64 {
	if i.snapInfo == nil {
		return 0
	}
	return i.snapInfo.To
}

// Type returns the snapshot type, or the zero Type when no snapshot info is
// set.
func (i *torrentInfo) Type() snaptype.Type {
	if i.snapInfo == nil {
		return 0
	}
	return i.snapInfo.T
}

// Hash returns the torrent hash.
func (i *torrentInfo) Hash() string {
	return i.hash
}
// Sys returns a *torrentInfo carrying the item's hash. When the item's name
// parses as a snapshot file name, the parsed snapshot details are attached as
// well; otherwise they are left nil.
func (fi *fileInfo) Sys() any {
	parsed, ok := snaptype.ParseFileName("", fi.Name())
	if !ok {
		return &torrentInfo{hash: fi.info.Hash}
	}

	return &torrentInfo{snapInfo: &parsed, hash: fi.info.Hash}
}
// dirEntry exposes a fileInfo through directory-entry style accessors.
type dirEntry struct {
// info is the file info this entry delegates to.
info *fileInfo
}
// Name returns the name reported by the wrapped file info.
func (e dirEntry) Name() string {
return e.info.Name()
}
// IsDir reports whether the wrapped file info describes a directory.
func (e dirEntry) IsDir() bool {
return e.info.IsDir()
}
// Type always returns fs.ModeIrregular.
func (e dirEntry) Type() fs.FileMode {
return fs.ModeIrregular
}
// Info returns the wrapped file info; the error is always nil.
func (e dirEntry) Info() (fs.FileInfo, error) {
return e.info, nil
}
// ReadRemoteDir lists every item known to the session as a directory entry,
// ordered by name. Neither ctx nor refresh is consulted, and the returned
// error is always nil.
func (s *torrentSession) ReadRemoteDir(ctx context.Context, refresh bool) ([]fs.DirEntry, error) {
	listing := make([]fs.DirEntry, 0, len(s.items))

	for _, item := range s.items {
		listing = append(listing, &dirEntry{info: &fileInfo{info: item}})
	}

	// Map iteration order is random, so sort to give a stable listing.
	byName := func(a, b fs.DirEntry) int {
		return strings.Compare(a.Name(), b.Name())
	}
	slices.SortFunc(listing, byName)

	return listing, nil
}
// LocalFsRoot returns the data directory configured on the torrent client.
func (s *torrentSession) LocalFsRoot() string {
return s.cli.cfg.DataDir
}
// RemoteFsRoot always returns the empty string for a torrent session.
func (s *torrentSession) RemoteFsRoot() string {
return ""
}
// Download fetches the named files through the torrent client, running one
// goroutine per file, and returns the first error reported (nil if all
// succeed).
//
// For each file it looks up the session item (returning fs.ErrNotExist when
// the name is unknown), reuses the client's torrent with that name or adds one
// from a magnet link, waits for the torrent info, downloads the data if the
// torrent is not already complete, and finally drops the torrent and waits for
// it to close. Cancelling ctx aborts the waits with ctx.Err().
func (s *torrentSession) Download(ctx context.Context, files ...string) error {
	group, ctx := errgroup.WithContext(ctx)
	// One slot per file, so every requested download may run at once.
	group.SetLimit(len(files))

	// resolve returns the torrent already registered under name, or adds a
	// new one built from a magnet link for hash.
	resolve := func(name, hash string) (*torrent.Torrent, error) {
		infoHash := snaptype.Hex2InfoHash(hash)

		for _, existing := range s.cli.Torrents() {
			if existing.Name() == name {
				return existing, nil
			}
		}

		meta := &metainfo.MetaInfo{AnnounceList: downloader.Trackers}
		magnet := meta.Magnet(&infoHash, &metainfo.Info{Name: name})

		spec, err := torrent.TorrentSpecFromMagnetUri(magnet.String())
		if err != nil {
			return nil, err
		}

		// Data download stays disabled on the spec; it is enabled explicitly
		// later, once the torrent info has arrived.
		spec.DisallowDataDownload = true

		added, _, err := s.cli.AddTorrentSpec(spec)
		if err != nil {
			return nil, err
		}

		return added, nil
	}

	// fetch downloads a single file and releases its torrent afterwards.
	fetch := func(name string) error {
		item, known := s.items[name]
		if !known {
			return fs.ErrNotExist
		}

		t, err := resolve(name, item.Hash)
		if err != nil {
			return err
		}

		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-t.GotInfo():
		}

		if !t.Complete.Bool() {
			t.AllowDataDownload()
			t.DownloadAll()

			select {
			case <-ctx.Done():
				return ctx.Err()
			case <-t.Complete.On():
			}
		}

		// Grab the closed channel before dropping, then wait for the drop to
		// take effect.
		closed := t.Closed()
		t.Drop()
		<-closed

		return nil
	}

	for _, f := range files {
		name := f // per-iteration copy for the goroutine
		group.Go(func() error {
			return fetch(name)
		})
	}

	return group.Wait()
}
// Label returns the fixed label "torrents" for this session type.
func (s *torrentSession) Label() string {
return "torrents"
}
// NewTorrentSession creates a torrent-backed session for chain, indexing the
// chain's preverified snapshot items by name.
func NewTorrentSession(cli *TorrentClient, chain string) *torrentSession {
	preverified := snapcfg.KnownCfg(chain, 0).Preverified

	items := make(map[string]snapcfg.PreverifiedItem, len(preverified))
	for _, item := range preverified {
		items[item.Name] = item
	}

	return &torrentSession{cli: cli, items: items}
}
// DownloadManifest reads "manifest.txt" from the session and returns one
// directory entry per line, each carrying the line text as its name.
//
// Only *downloader.RCloneSession is supported; any other session type yields
// a "not implemented" error. Errors from reading the manifest are returned
// unchanged.
func DownloadManifest(ctx context.Context, session DownloadSession) ([]fs.DirEntry, error) {
	rclone, ok := session.(*downloader.RCloneSession)
	if !ok {
		return nil, fmt.Errorf("not implemented for %T", session)
	}

	manifest, err := rclone.Cat(ctx, "manifest.txt")
	if err != nil {
		return nil, err
	}

	var entries []fs.DirEntry

	lines := bufio.NewScanner(manifest)
	for lines.Scan() {
		item := snapcfg.PreverifiedItem{Name: lines.Text()}
		entries = append(entries, dirEntry{info: &fileInfo{info: item}})
	}

	if err := lines.Err(); err != nil {
		return nil, err
	}

	return entries, nil
}
// DownloadSession is the set of operations a snapshot download source
// provides.
type DownloadSession interface {
// Download fetches the named files, returning an error if any fail.
Download(ctx context.Context, files ...string) error
// ReadRemoteDir lists the entries available from the source.
// NOTE(review): the meaning of refresh is implementation defined - confirm
// against each implementation.
ReadRemoteDir(ctx context.Context, refresh bool) ([]fs.DirEntry, error)
// LocalFsRoot returns the session's local root directory.
LocalFsRoot() string
// RemoteFsRoot returns the session's remote root, if it has one.
RemoteFsRoot() string
// Label returns a short name for the session.
Label() string
}