From 3fee7c3a71b3d77fa84520025082dc5254607440 Mon Sep 17 00:00:00 2001 From: Krzysztof Kowalczyk Date: Fri, 8 Oct 2021 21:56:44 +0000 Subject: [PATCH] add siser --- .github/workflows/build.yml | 22 ++ README.md | 8 + go.mod | 5 +- go.sum | 10 + server/readme.md | 5 + siser/read_write_test.go | 525 ++++++++++++++++++++++++++++++++++++ siser/reader.go | 193 +++++++++++++ siser/readme.md | 117 ++++++++ siser/record.go | 187 +++++++++++++ siser/util.go | 79 ++++++ siser/writer.go | 65 +++++ 11 files changed, 1215 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/build.yml create mode 100644 siser/read_write_test.go create mode 100644 siser/reader.go create mode 100644 siser/readme.md create mode 100644 siser/record.go create mode 100644 siser/util.go create mode 100644 siser/writer.go diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..b816930 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,22 @@ +name: Build and test Go +on: push +jobs: + build: + name: Build + runs-on: ubuntu-latest + steps: + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: 1.17 + + - name: Check out source code + uses: actions/checkout@v2 + + - name: Test + run: go test -v ./... + + - name: Staticcheck + run: | + # go get -u honnef.co/go/tools/cmd/staticcheck + # staticcheck ./... diff --git a/README.md b/README.md index 9b78389..fb5a03a 100644 --- a/README.md +++ b/README.md @@ -1 +1,9 @@ A bunch of Go packages that I use in multiple projects. + +An overview of packages: +* `filerotate` : implements a file you can write to and rotates on a schedule +(e.g. daily or hourly). I use it for log files +* `server` : a very specific abstraction over http server that allows +me to write http server that is dynamic during dev (e.g. 
generates .html +files from templates) and can be turned into a fully static website easily +* `siser` : Simple Serialization format diff --git a/go.mod b/go.mod index 9238ea3..9064152 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,7 @@ module github.com/kjk/common go 1.16 -require github.com/andybalholm/brotli v1.0.3 +require ( + github.com/andybalholm/brotli v1.0.3 + github.com/stretchr/testify v1.7.0 // indirect +) diff --git a/go.sum b/go.sum index 402458f..8f44029 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,12 @@ github.com/andybalholm/brotli v1.0.3 h1:fpcw+r1N1h0Poc1F/pHbW40cUm/lMEQslZtCkBQ0UnM= github.com/andybalholm/brotli v1.0.3/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= +github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/server/readme.md b/server/readme.md index 6d778f4..8b548bd 100644 --- a/server/readme.md +++ b/server/readme.md @@ -1,3 +1,8 @@ +Package `github.com/kjk/common/server` is an abstraction +over http server in Go std lib. + +You won't like it, it's too different. + Those are useful snippets that use the server code for re-use. 
Copy & paste & modify for your purpose. diff --git a/siser/read_write_test.go b/siser/read_write_test.go new file mode 100644 index 0000000..19ea671 --- /dev/null +++ b/siser/read_write_test.go @@ -0,0 +1,525 @@ +package siser + +import ( + "bufio" + "bytes" + "encoding/base64" + "encoding/json" + "fmt" + "io/ioutil" + "math/rand" + "os" + "path/filepath" + "strconv" + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +var ( + largeValue = "" + serializedJSON []byte + serializedSiser []byte +) + +func genLargeValue() { + s := "0123456789" + s += s // 20 + s += s // 40 + s += s // 80 + s += s // 160 + s += s // 320 + largeValue = s +} + +func init() { + genLargeValue() + genSerializedSiser() + genSerializedJSON() +} + +func timeDiff(t1 time.Time, t2 time.Time) time.Duration { + dur := t1.Sub(t2) + if dur < 0 { + dur = -dur + } + return dur +} + +func timeDiffLessThanMs(t1 time.Time, t2 time.Time) bool { + return timeDiff(t1, t2) < time.Millisecond +} + +func must(err error) { + if err != nil { + panic(err) + } +} + +var ( + testNo int +) + +// writes a valid test case for fuzzing corpus +// only if GEN_FUZZ_CORPUS env variable is set +func writeCorpus(d []byte) { + if os.Getenv("GEN_FUZZ_CORPUS") == "" { + return + } + dir := "fuzz_corpus" + err := os.MkdirAll(dir, 0755) + must(err) + testNo++ + name := fmt.Sprintf("test%d.txt", testNo) + path := filepath.Join(dir, name) + err = ioutil.WriteFile(path, d, 0644) + must(err) +} + +func testRoundTrip(t *testing.T, r *Record) string { + d := r.Marshal() + rec, err := UnmarshalRecord(d, nil) + assert.NoError(t, err) + rec2 := &Record{} + err = rec2.Unmarshal(d) + assert.NoError(t, err) + + // name and timestamp are not serialized here + assert.Equal(t, rec.Entries, r.Entries) + assert.Equal(t, rec2.Entries, r.Entries) + + testWriterRoundTrip(t, r) + writeCorpus(d) + + return string(d) +} + +func TestUnmarshalErrors(t *testing.T) { + invalidRecords := []string{ + "ha", + "ha\n", + "ha:\n", + "ha:_\n", + 
"ha:+32\nma", + "ha:+2\nmara", + "ha:+los\nma", + } + // test error paths in UnmarshalRecord + for _, s := range invalidRecords { + _, err := UnmarshalRecord([]byte(s), nil) + assert.Error(t, err, "s: '%s'", s) + } +} + +func testWriterRoundTrip(t *testing.T, r *Record) { + var buf bytes.Buffer + w := NewWriter(&buf) + n, err := w.WriteRecord(r) + assert.NoError(t, err) + d := buf.Bytes() + assert.Equal(t, len(d), n) + + buf2 := bytes.NewBuffer(d) + reader := NewReader(bufio.NewReader(buf2)) + ok := reader.ReadNextRecord() + assert.True(t, ok) + rec := reader.Record + assert.Equal(t, rec.Entries, r.Entries) + assert.Equal(t, rec.Name, r.Name) + + assert.True(t, r.Timestamp.IsZero() || timeDiffLessThanMs(rec.Timestamp, r.Timestamp), "rec.Timestamp: %s, r.Timestamp: %s, diff: %s", rec.Timestamp, r.Timestamp, timeDiff(rec.Timestamp, r.Timestamp)) +} + +type testRec struct { + s string + name string + pos int +} + +func mkTestRec(s string, name string) *testRec { + return &testRec{ + s: s, + name: name, + } +} + +func writeData(t *testing.T, tests []*testRec) *bytes.Buffer { + buf := &bytes.Buffer{} + // fixed time so that we can + unixNano := 5 * time.Second + tm := time.Unix(0, int64(unixNano)) + w := NewWriter(buf) + currPos := 0 + for _, test := range tests { + test.pos = currPos + n, err := w.Write([]byte(test.s), tm, test.name) + assert.NoError(t, err) + currPos += n + } + return buf +} + +func readAndVerifyData(t *testing.T, buf *bytes.Buffer, tests []*testRec) int64 { + unixNano := 5 * time.Second + tm := time.Unix(0, int64(unixNano)) + r := NewReader(bufio.NewReader(buf)) + n := 0 + for n < len(tests) && r.ReadNextData() { + test := tests[n] + assert.Equal(t, test.s, string(r.Data)) + assert.Equal(t, test.name, string(r.Name)) + assert.True(t, r.Timestamp.Equal(tm)) + expPos := int64(test.pos) + assert.Equal(t, expPos, r.CurrRecordPos) + n++ + } + assert.NoError(t, r.Err()) + return r.NextRecordPos +} + +func TestWriter(t *testing.T) { + tests := []*testRec{ + 
mkTestRec("hey\n", ""), + mkTestRec("ho", "with name"), + } + exp := `4 5000 +hey +2 5000 with name +ho +` + buf := writeData(t, tests) + d := buf.Bytes() + assert.Equal(t, exp, string(d)) + + readAndVerifyData(t, bytes.NewBuffer(d), tests) +} + +func TestWriterBug(t *testing.T) { + // we had a bug where file that starts with '\n' would cause problems + // because of the padding we add in writer but didn't properly + // account in reader + tests := []*testRec{ + // "foo" ends with newline, so we won't add it when + // writing a record + mkTestRec("foo\n", "foo.txt"), + } + + buf := writeData(t, tests) + expPos := int64(buf.Len()) + buf.WriteString("\nstarts with new line") + + buf2 := bytes.NewBuffer(buf.Bytes()) + gotPos := readAndVerifyData(t, buf2, tests) + assert.Equal(t, expPos, gotPos) +} + +func TestRecordSerializeSimple(t *testing.T) { + var r Record + + { + d := r.Marshal() + assert.Equal(t, 0, len(d)) + } + + r.Write("key", "val") + + { + v, ok := r.Get("key") + assert.True(t, ok) + assert.Equal(t, v, "val") + } + + { + v, ok := r.Get("Key") + assert.False(t, ok) + assert.Equal(t, v, "") + } + + s := testRoundTrip(t, &r) + assert.Equal(t, "key: val\n", s) +} + +func TestRecordSerializeSimple2(t *testing.T) { + var r Record + r.Write("k2", "a\nb") + s := testRoundTrip(t, &r) + assert.Equal(t, "k2:+3\na\nb\n", s) +} + +func TestWriterNoTimestamp(t *testing.T) { + tests := []struct { + data string + name string + exp string + }{ + { + data: "foo", + name: "name", + exp: "3 name\nfoo\n", + }, + { + data: "foo\n", + name: "", + exp: "4\nfoo\n", + }, + } + for _, test := range tests { + var buf bytes.Buffer + w := NewWriter(&buf) + w.NoTimestamp = true + var tm time.Time + _, err := w.Write([]byte(test.data), tm, test.name) + assert.Nil(t, err) + got := buf.String() + assert.Equal(t, test.exp, got) + + rbuf := bufio.NewReader(bytes.NewBufferString(got)) + r := NewReader(rbuf) + r.NoTimestamp = true + ok := r.ReadNextData() + assert.True(t, ok) + assert.Equal(t, 
string(r.Data), test.data) + assert.Equal(t, r.Name, test.name) + } +} + +func TestRecordSerializeSimple3(t *testing.T) { + var r Record + r.Write("long key", largeValue) + got := testRoundTrip(t, &r) + exp := fmt.Sprintf("long key:+%d\n%s\n", len(largeValue), largeValue) + assert.Equal(t, exp, got) +} + +func TestRecordSerializeSimple4(t *testing.T) { + var r Record + r.Write("k2", "a\nb") + r.Write("", "no name") + r.Write("bu", "gatti ") + r.Write("no value", "") + r.Write("bu", " gatti") + got := testRoundTrip(t, &r) + exp := `k2:+3 +a +b +: no name +bu: gatti +no value:+0 +bu: gatti +` + assert.Equal(t, exp, got) +} + +func TestMany(t *testing.T) { + testMany(t, "") + testMany(t, "named") +} + +func testMany(t *testing.T, name string) { + var buf bytes.Buffer + w := NewWriter(&buf) + + // we can't compare timestamp directly but as truncated to milliseconds + now := time.Now() + + rec := &Record{} + var positions []int64 + var currPos int64 + nRecs := 8 + for i := 0; i < nRecs; i++ { + rec.Reset() + rec.Name = name + rec.Timestamp = now + nRand := rand.Intn(1024) + rec.Write("counter", strconv.Itoa(i), "random", strconv.Itoa(nRand)) + if i%12 == 0 { + rec.Write("large", largeValue) + // test a case where large value is last in the record as well + // as being followed by another value + if rand.Intn(1024) > 512 { + rec.Write("after", "whatever") + } + } + n, err := w.WriteRecord(rec) + assert.Nil(t, err) + positions = append(positions, currPos) + currPos += int64(n) + } + + f := bufio.NewReader(bytes.NewBuffer(buf.Bytes())) + reader := NewReader(f) + i := 0 + for reader.ReadNextRecord() { + rec := reader.Record + recPos := reader.CurrRecordPos + assert.Equal(t, recPos, positions[i]) + if i < len(positions)-1 { + nextRecPos := reader.NextRecordPos + assert.Equal(t, nextRecPos, positions[i+1]) + } + + counter, ok := rec.Get("counter") + assert.True(t, ok) + exp := strconv.Itoa(i) + assert.Equal(t, exp, counter) + _, ok = rec.Get("random") + assert.True(t, ok) + 
assert.Equal(t, rec.Name, name) + assert.True(t, timeDiffLessThanMs(rec.Timestamp, now), "timestamp: %s, now: %s", rec.Timestamp, now) + i++ + } + assert.NoError(t, reader.Err()) + assert.Equal(t, nRecs, i) +} + +func TestWritePanics(t *testing.T) { + rec := &Record{} + assert.Panics(t, func() { rec.Write("foo") }, "should panic with even number of arguments") +} + +func TestIntStrLen(t *testing.T) { + numbers := []int{-1, 0, 1} + n1 := 1 + n2 := -1 + for i := 0; i < 10; i++ { + n1 = n1*10 + i + 1 + n2 = n2*10 - i - 1 + numbers = append(numbers, n1, n2) + } + for _, n := range numbers { + got := intStrLen(n) + exp := len(strconv.Itoa(n)) + assert.Equal(t, exp, got) + } +} + +func TestCrashes(t *testing.T) { + tests := []string{ + "/yBrZXk6Ky0yMAoA+f//+zA=", + "/QA6Ky03OQowMQAAMw==", + } + for _, tc := range tests { + d, err := base64.StdEncoding.DecodeString(tc) + assert.NoError(t, err) + _, _ = UnmarshalRecord(d, nil) + } +} + +var rec Record +var globalData []byte + +type testRecJSON struct { + URI string `json:"uri"` + Code int `json:"code"` + IP string `json:"ip"` + Duration time.Duration `json:"dur"` + When time.Time `json:"when"` + Size int `json:"size"` + UserAgent string `json:"ua"` + Referer string `json:"referer"` +} + +func BenchmarkSiserMarshal(b *testing.B) { + for n := 0; n < b.N; n++ { + rec.Reset() + rec.Write("uri", "/atom.xml") + rec.Write("code", strconv.Itoa(200)) + rec.Write("ip", "54.186.248.49") + durMs := float64(1.41) / float64(time.Millisecond) + durStr := strconv.FormatFloat(durMs, 'f', 2, 64) + rec.Write("dur", durStr) + rec.Write("when", time.Now().Format(time.RFC3339)) + rec.Write("size", strconv.Itoa(35286)) + rec.Write("ua", "Feedspot http://www.feedspot.com") + rec.Write("referer", "http://blog.kowalczyk.info/feed") + // assign to global to prevents optimizing the loop + globalData = rec.Marshal() + } +} + +func BenchmarkSiserMarshal2(b *testing.B) { + for n := 0; n < b.N; n++ { + rec.Reset() + durMs := float64(1.41) / 
float64(time.Millisecond) + durStr := strconv.FormatFloat(durMs, 'f', 2, 64) + rec.Write( + "uri", "/atom.xml", + "code", strconv.Itoa(200), + "ip", "54.186.248.49", + "dur", durStr, + "when", time.Now().Format(time.RFC3339), + "size", strconv.Itoa(35286), + "ua", "Feedspot http://www.feedspot.com", + "referer", "http://blog.kowalczyk.info/feed") + // assign to global to prevents optimizing the loop + globalData = rec.Marshal() + } +} + +func BenchmarkJSONMarshal(b *testing.B) { + for n := 0; n < b.N; n++ { + rec := testRecJSON{ + URI: "/atom.xml", + Code: 200, + IP: "54.186.248.49", + Duration: time.Microsecond * time.Duration(1410), + When: time.Now(), + Size: 35286, + UserAgent: "Feedspot http://www.feedspot.com", + Referer: "http://blog.kowalczyk.info/feed", + } + d, err := json.Marshal(rec) + panicIfErr(err) + // assign to global to prevents optimizing the loop + globalData = d + } +} + +func genSerializedSiser() { + var rec Record + rec.Write("uri", "/atom.xml") + rec.Write("code", strconv.Itoa(200)) + rec.Write("ip", "54.186.248.49") + durMs := float64(1.41) / float64(time.Millisecond) + durStr := strconv.FormatFloat(durMs, 'f', 2, 64) + rec.Write("dur", durStr) + rec.Write("when", time.Now().Format(time.RFC3339)) + rec.Write("size", strconv.Itoa(35286)) + rec.Write("ua", "Feedspot http://www.feedspot.com") + rec.Write("referer", "http://blog.kowalczyk.info/feed") + serializedSiser = rec.Marshal() +} + +func genSerializedJSON() { + rec := testRecJSON{ + URI: "/atom.xml", + Code: 200, + IP: "54.186.248.49", + Duration: time.Microsecond * time.Duration(1410), + When: time.Now(), + Size: 35286, + UserAgent: "Feedspot http://www.feedspot.com", + Referer: "http://blog.kowalczyk.info/feed", + } + d, err := json.Marshal(rec) + panicIfErr(err) + serializedJSON = d +} + +func BenchmarkSiserUnmarshal(b *testing.B) { + var rec Record + var err error + for n := 0; n < b.N; n++ { + err = rec.Unmarshal(serializedSiser) + panicIfErr(err) + } +} + +func 
BenchmarkJSONUnmarshal(b *testing.B) { + var rec testRecJSON + for n := 0; n < b.N; n++ { + err := json.Unmarshal(serializedJSON, &rec) + panicIfErr(err) + } +} diff --git a/siser/reader.go b/siser/reader.go new file mode 100644 index 0000000..f7a3821 --- /dev/null +++ b/siser/reader.go @@ -0,0 +1,193 @@ +package siser + +import ( + "bufio" + "bytes" + "fmt" + "io" + "strconv" + "time" +) + +// Reader is for reading (deserializing) records from a bufio.Reader +type Reader struct { + r *bufio.Reader + + // hints that the data was written without a timestamp + // (see Writer.NoTimestamp). We're permissive i.e. we'll + // read timestamp if it's written even if NoTimestamp is true + NoTimestamp bool + + // Record is available after ReadNextRecord(). + // It's over-written in next ReadNextRecord(). + Record *Record + + // Data / Name / Timestamp are available after ReadNextData. + // They are over-written in next ReadNextData. + Data []byte + Name string + Timestamp time.Time + + // position of the current record within the reader. + // We keep track of it so that callers can index records + // by offset and seek to it + CurrRecordPos int64 + + // position of the next record within the reader. + NextRecordPos int64 + + err error + + // true if reached end of file with io.EOF + done bool +} + +// NewReader creates a new reader +func NewReader(r *bufio.Reader) *Reader { + return &Reader{ + r: r, + Record: &Record{}, + } +} + +// Done returns true if we're finished reading from the reader +func (r *Reader) Done() bool { + return r.err != nil || r.done +} + +// ReadNextData reads next block from the reader, returns false +// when no more record. If returns false, check Err() to see +// if there were errors. 
+// After reading Data containst data, and Timestamp and (optional) Name +// contain meta-data +func (r *Reader) ReadNextData() bool { + if r.Done() { + return false + } + r.Name = "" + r.CurrRecordPos = r.NextRecordPos + + // read header in the format: + // "${size} ${timestamp_in_unix_epoch_ms} ${name}\n" + // or (if NoTimestamp): + // "${size} ${name}\n" + // ${name} is optional + hdr, err := r.r.ReadBytes('\n') + if err != nil { + if err == io.EOF { + r.done = true + } else { + r.err = err + } + return false + } + recSize := len(hdr) + rest := hdr[:len(hdr)-1] // remove '\n' from end + idx := bytes.IndexByte(rest, ' ') + var dataSize []byte + if idx == -1 { + if !r.NoTimestamp { + // with timestamp, we need at least 2 values separated by space + r.err = fmt.Errorf("unexpected header '%s'", string(hdr)) + return false + } + dataSize = rest + rest = nil + } else { + dataSize = rest[:idx] + rest = rest[idx+1:] + } + var name []byte + var timestamp []byte + idx = bytes.IndexByte(rest, ' ') + if idx == -1 { + if r.NoTimestamp { + // no timestamp, just name + name = rest + } else { + // no name, just timestamp + timestamp = rest + } + } else { + // timestamp and name + timestamp = rest[:idx] + name = rest[idx+1:] + } + + size, err := strconv.ParseInt(string(dataSize), 10, 64) + if err != nil { + r.err = fmt.Errorf("unexpected header '%s'", string(hdr)) + return false + } + + if len(timestamp) > 0 { + timeMs, err := strconv.ParseInt(string(timestamp), 10, 64) + if err != nil { + r.err = fmt.Errorf("unexpected header '%s'", string(hdr)) + return false + } + r.Timestamp = TimeFromUnixMillisecond(timeMs) + } + r.Name = string(name) + + // we try to re-use r.Data as long as it doesn't grow too much + // (limit to 1 MB) + if cap(r.Data) > 1024*1024 { + r.Data = nil + } + if size > int64(cap(r.Data)) { + r.Data = make([]byte, size) + } else { + // re-use existing buffer + r.Data = r.Data[:size] + } + n, err := io.ReadFull(r.r, r.Data) + if err != nil { + r.err = err + 
return false + } + panicIf(n != len(r.Data)) + recSize += n + + // account for the fact that for readability we might + // have padded data with '\n' + // same as needsNewline logic in Writer.Write + n = len(r.Data) + needsNewline := (n > 0) && (r.Data[n-1] != '\n') + if needsNewline { + _, err = r.r.Discard(1) + if err != nil { + r.err = err + return false + } + recSize++ + } + r.NextRecordPos += int64(recSize) + return true +} + +// ReadNextRecord reads a key / value record. +// Returns false if there are no more record. +// Check Err() for errors. +// After reading information is in Record (valid until +// next read). +func (r *Reader) ReadNextRecord() bool { + ok := r.ReadNextData() + if !ok { + return false + } + + _, r.err = UnmarshalRecord(r.Data, r.Record) + if r.err != nil { + return false + } + r.Record.Name = r.Name + r.Record.Timestamp = r.Timestamp + return true +} + +// Err returns error from last Read. We swallow io.EOF to make it easier +// to use +func (r *Reader) Err() error { + return r.err +} diff --git a/siser/readme.md b/siser/readme.md new file mode 100644 index 0000000..47630dc --- /dev/null +++ b/siser/readme.md @@ -0,0 +1,117 @@ +# siser + +Package `siser` is a Simple Serialization library for Go + +Imagine you want to write many records of somewhat structured data +to a file. Think of it as structured logging. + +You could use csv format, but csv values are identified by a position, +not name. They are also hard to read. + +You could serialize as json and write one line per json record but +json isn't great for human readability (imagine you `tail -f` a log +file with json records). + +This library is meant to be a middle ground: +* you can serialize arbitrary records with multiple key/value pairs +* the output is human-readable +* it's designed to be efficient and simple to use + +## API usage + +Imagine you want log basic info about http requests. 
+ +```go +func createWriter() (*siser.Writer, error) { + f, err := os.Create("http_access.log") + if err != nil { + return nil, err + } + w := siser.NewWriter(f) + return w, nil +} + +func logHTTPRequest(w *siser.Writer, url string, ipAddr string, statusCode int) error { + var rec siser.Record + // you can append multiple key/value pairs at once + rec.Write("url", url, "ipaddr", ipAddr) + // or assemble with multiple calls + rec.Write("code", strconv.Itoa(statusCode)) + _, err := w.WriteRecord(&rec) + return err } +``` + +The data will be written to writer underlying `siser.Writer` as: +``` +61 1553488435903 httplog +url: https://blog.kowalczyk.info +ipaddr: 10.0.0.1 +code: 200 +``` + +Here's what and why: +* `61` is the size of the data. This allows us to read the exact number of bytes in the record +* `1553488435903` is a timestamp which is Unix epoch time in milliseconds (more precision than standard Unix time which is in seconds) +* `httplog` is optional name of the record. This allows you to easily write multiple types of records to a file + +To read all records from the file: +```go +f, err := os.Open("http_access.log") +fatalIfErr(err) +defer f.Close() +reader := siser.NewReader(bufio.NewReader(f)) +for reader.ReadNextRecord() { + rec := reader.Record + name := rec.Name // "httplog" + timestamp := rec.Timestamp + code, ok := rec.Get("code") + // get rest of values and do something with them +} +fatalIfErr(reader.Err()) +``` + +## Usage scenarios + +I use `siser` in my web services for 2 use cases: + +* logging to help in debugging issues after they happen +* implementing poor-man's analytics + +Logging for debugging adds a little bit more structure over +ad hoc logging. I can add some meta-data to log entries +and in addition to reading the logs I can quickly write +programs that filter the logs. For example if I add serving time +to http request log I could easily write a program that shows +requests that take over 1 second to serve. 
+ +Another one is poor-man's analytics. For example, if you're building +a web service that converts .png file to .ico file, it would be +good to know daily statistics about how many files were converted, +how much time an average conversion takes etc. + +## Performance and implementation notes + +Some implementation decisions were made with performance in mind. + +Given key/value nature of the record, an easy choice would be to use map[string]string as source to encode/decode functions. + +However `[]string` is more efficient than a `map`. Additionally, a slice can be reused across multiple records. We can clear it by setting the size to zero and reuse the underlying array. A map would require allocating a new instance for each record, which would create a lot of work for garbage collector. + +When serializing, you need to use `Reset` method to get the benefit of efficient re-use of the `Record`. + +When reading and deserializing records, `siser.Reader` uses this optimization internally. + +The format avoids the need for escaping keys and values, which helps in making encoding/decoding fast. + +How does that play out in real life? I wrote a benchmark comparing siser vs. json.Marshal. It’s about 30% faster: + +``` +$ go test -bench=. +BenchmarkSiserMarshal-8 1000000 1903 ns/op +BenchmarkJSONMarshal-8 500000 2905 ns/op +``` + +The format is binary-safe and works for serializing large values e.g. you can use png image as value. + +It’s also very easy to implement in any language. diff --git a/siser/record.go b/siser/record.go new file mode 100644 index 0000000..1ad49c8 --- /dev/null +++ b/siser/record.go @@ -0,0 +1,187 @@ +package siser + +import ( + "bytes" + "fmt" + "strconv" + "strings" + "time" +) + +/* +Serialize/Deserialize array of key/value pairs in a format that is easy +to serialize/parse and human-readable. 
+ +The basic format is line-oriented: "key: value\n" + +When value is long (> 120 chars) or has \n in it, we serialize it as: +key:+$len\n +value\n +*/ + +type Entry struct { + Key string + Value string +} + +// Record represents list of key/value pairs that can +// be serialized/deserialized +type Record struct { + // Entries are available after Unmarshal/UnmarshalRecord + Entries []Entry + buf strings.Builder + Name string + // when writing, if not provided we use current time + Timestamp time.Time +} + +func (r *Record) appendKeyVal(key, val string) { + e := Entry{ + Key: key, + Value: val, + } + r.Entries = append(r.Entries, e) +} + +// Write writes key/value pairs to a record. +// After you write all key/value pairs, call Marshal() +// to get serialized value (valid until next call to Reset()) +func (r *Record) Write(args ...string) { + n := len(args) + if n == 0 || n%2 != 0 { + panic(fmt.Sprintf("Invalid number of args: %d", len(args))) + } + for i := 0; i < n; i += 2 { + r.marshalKeyVal(args[i], args[i+1]) + // TODO: this is for api compat with older version + // remove it and fix the tests + r.appendKeyVal(args[i], args[i+1]) + } +} + +// Reset makes it easy to re-use Record (as opposed to allocating a new one +// each time) +func (r *Record) Reset() { + if r.Entries != nil { + r.Entries = r.Entries[0:0] + } + r.Name = "" + var t time.Time + r.Timestamp = t + r.buf.Reset() +} + +// Get returns a value for a given key +func (r *Record) Get(key string) (string, bool) { + for _, e := range r.Entries { + if e.Key == key { + return e.Value, true + } + } + return "", false +} + +func nonEmptyEndsWithNewline(s string) bool { + n := len(s) + return n == 0 || s[n-1] == '\n' +} + +// return true if value needs to be serialized in long, +// size-prefixed format +func needsLongFormat(s string) bool { + return len(s) == 0 || len(s) > 120 || !serializableOnLine(s) +} + +func (r *Record) marshalKeyVal(key, val string) { + r.buf.WriteString(key) + isLong := 
needsLongFormat(val) + if isLong { + r.buf.WriteString(":+") + slen := strconv.Itoa(len(val)) + r.buf.WriteString(slen) + r.buf.WriteByte('\n') + r.buf.WriteString(val) + // for readability: ensure a newline at the end so + // that header record always appears on new line + if !nonEmptyEndsWithNewline(val) { + r.buf.WriteByte('\n') + } + } else { + r.buf.WriteString(": ") + r.buf.WriteString(val) + r.buf.WriteByte('\n') + } +} + +// Marshal converts record to bytes +func (r *Record) Marshal() []byte { + return []byte(r.buf.String()) +} + +// UnmarshalRecord unmarshall record as marshalled with Record.Marshal +// For efficiency re-uses record r. If r is nil, will allocate new record. +func UnmarshalRecord(d []byte, r *Record) (*Record, error) { + if r == nil { + r = &Record{} + } else { + r.Reset() + } + + for len(d) > 0 { + idx := bytes.IndexByte(d, '\n') + if idx == -1 { + return nil, fmt.Errorf("missing '\n' marking end of header in '%s'", string(d)) + } + line := d[:idx] + d = d[idx+1:] + idx = bytes.IndexByte(line, ':') + if idx == -1 { + return nil, fmt.Errorf("line in unrecognized format: '%s'", line) + } + key := line[:idx] + val := line[idx+1:] + // at this point val must be at least one character (' ' or '+') + if len(val) < 1 { + return nil, fmt.Errorf("line in unrecognized format: '%s'", line) + } + kind := val[0] + val = val[1:] + if kind == ' ' { + r.appendKeyVal(string(key), string(val)) + continue + } + + if kind != '+' { + return nil, fmt.Errorf("line in unrecognized format: '%s'", line) + } + + n, err := strconv.Atoi(string(val)) + if err != nil { + return nil, err + } + if n < 0 { + return nil, fmt.Errorf("negative length %d of data", n) + } + if n > len(d) { + return nil, fmt.Errorf("length of value %d greater than remaining data of size %d", n, len(d)) + } + val = d[:n] + d = d[n:] + // encoder might put optional newline + if len(d) > 0 && d[0] == '\n' { + d = d[1:] + } + r.appendKeyVal(string(key), string(val)) + } + return r, nil +} + +// 
Unmarshal resets record and decodes data as created by Marshal +// into it. +func (r *Record) Unmarshal(d []byte) error { + rec, err := UnmarshalRecord(d, r) + panicIf(err == nil && rec == nil, "should return err or rec") + panicIf(err != nil && rec != nil, "if error, rec should be nil") + panicIf(rec != nil && rec != r, "if returned rec, must be same as r") + return err +} diff --git a/siser/util.go b/siser/util.go new file mode 100644 index 0000000..1909df8 --- /dev/null +++ b/siser/util.go @@ -0,0 +1,79 @@ +package siser + +import ( + "fmt" + "time" +) + +func fmtArgs(args ...interface{}) string { + if len(args) == 0 { + return "" + } + format := args[0].(string) + if len(args) == 1 { + return format + } + return fmt.Sprintf(format, args[1:]...) +} + +func panicWithMsg(defaultMsg string, args ...interface{}) { + s := fmtArgs(args...) + if s == "" { + s = defaultMsg + } + panic(s) +} + +func panicIfErr(err error, args ...interface{}) { + if err == nil { + return + } + panicWithMsg(err.Error(), args...) +} + +func panicIf(cond bool, args ...interface{}) { + if !cond { + return + } + panicWithMsg("fatalIf: condition failed", args...) +} + +// intStrLen calculates how long n would be when converted to a string +// i.e. equivalent of len(strconv.Itoa(n)) but faster +// Note: not used +func intStrLen(n int) int { + l := 1 // count the last digit here + if n < 0 { + n = -n + l = 2 + } + for n > 9 { + l++ + n = n / 10 + } + return l +} + +func serializableOnLine(s string) bool { + n := len(s) + for i := 0; i < n; i++ { + b := s[i] + if b < 32 || b > 127 { + return false + } + } + return true +} + +// TimeToUnixMillisecond converts t into Unix epoch time in milliseconds. +// That's because seconds is not enough precision and nanoseconds is too much. +func TimeToUnixMillisecond(t time.Time) int64 { + n := t.UnixNano() + return n / 1e6 +} + +// TimeFromUnixMillisecond returns time from Unix epoch time in milliseconds. 
+func TimeFromUnixMillisecond(unixMs int64) time.Time { + nano := unixMs * 1e6 + return time.Unix(0, nano) +} diff --git a/siser/writer.go b/siser/writer.go new file mode 100644 index 0000000..b627acb --- /dev/null +++ b/siser/writer.go @@ -0,0 +1,65 @@ +package siser + +import ( + "io" + "strconv" + "time" +) + +// Writer writes records in a structured format +type Writer struct { + w io.Writer + // NoTimestamp disables writing timestamp, which + // makes serialized data not depend on when they were written + NoTimestamp bool +} + +// NewWriter creates a writer +func NewWriter(w io.Writer) *Writer { + return &Writer{ + w: w, + } +} + +// WriteRecord writes a record in a specified format +func (w *Writer) WriteRecord(r *Record) (int, error) { + d := r.Marshal() + return w.Write(d, r.Timestamp, r.Name) +} + +// Write writes a block of data with optional timestamp and name. +// Returns number of bytes written (length of d + length of metadata) +// and an error +func (w *Writer) Write(d []byte, t time.Time, name string) (int, error) { + var hdr string + if w.NoTimestamp { + hdr = strconv.Itoa(len(d)) + } else { + if t.IsZero() { + t = time.Now() + } + ms := TimeToUnixMillisecond(t) + hdr = strconv.Itoa(len(d)) + " " + strconv.FormatInt(ms, 10) + } + if name != "" { + hdr += " " + name + } + hdr += "\n" + n := len(d) + bufSize := len(hdr) + n + // for readability, if the record doesn't end with newline, + // we add one at the end. Makes decoding a bit harder but + // not by much. + needsNewline := (n > 0) && (d[n-1] != '\n') + if needsNewline { + bufSize += 1 + } + + buf := make([]byte, 0, bufSize) + buf = append(buf, hdr...) + buf = append(buf, d...) + if needsNewline { + buf = append(buf, '\n') + } + return w.w.Write(buf) +}