core/rawdb: fix datarace in freezer (#22728)

The Append / truncate operations were racy. When a datafile reaches 2Gb, a new file is needed. For this operation, we require a writelock, which is not needed in the 99.99% of all cases where the data does fit in the current head-file.

This transition from readlock to writelock was incorrect, and as the readlock was released, a truncate operation could slip in between, and truncate the data. This would have been fine, however, the Append operation continued writing as if no truncation had occurred, e.g writing item 5 where item 0 should reside.

This PR changes the behaviour, so that if when we run into the situation that a new file is needed, it aborts, and retries, this time with a writelock.

The outcome of the situation described above, running on this PR, would instead be that the Append operation exits with a failure.
This commit is contained in:
Martin Holst Swende 2021-04-26 18:19:07 +02:00 committed by GitHub
parent 34f3c9539b
commit 9b99e3dfe0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 117 additions and 41 deletions

@ -465,35 +465,59 @@ func (t *freezerTable) releaseFilesAfter(num uint32, remove bool) {
// Note, this method will *not* flush any data to disk so be sure to explicitly
// fsync before irreversibly deleting data from the database.
func (t *freezerTable) Append(item uint64, blob []byte) error {
// Read lock prevents competition with truncate
t.lock.RLock()
// Ensure the table is still accessible
if t.index == nil || t.head == nil {
t.lock.RUnlock()
return errClosed
}
// Ensure only the next item can be written, nothing else
if atomic.LoadUint64(&t.items) != item {
t.lock.RUnlock()
return fmt.Errorf("appending unexpected item: want %d, have %d", t.items, item)
}
// Encode the blob and write it into the data file
// Encode the blob before the lock portion
if !t.noCompression {
blob = snappy.Encode(nil, blob)
}
bLen := uint32(len(blob))
// Read lock prevents competition with truncate
retry, err := t.append(item, blob, false)
if err != nil {
return err
}
if retry {
// Read lock was insufficient, retry with a writelock
_, err = t.append(item, blob, true)
}
return err
}
// append injects a binary blob at the end of the freezer table.
// Normally, inserts do not require holding the write-lock, so it should be invoked with 'wlock' set to
// false.
// However, if the data will grown the current file out of bounds, then this
// method will return 'true, nil', indicating that the caller should retry, this time
// with 'wlock' set to true.
func (t *freezerTable) append(item uint64, encodedBlob []byte, wlock bool) (bool, error) {
if wlock {
t.lock.Lock()
defer t.lock.Unlock()
} else {
t.lock.RLock()
defer t.lock.RUnlock()
}
// Ensure the table is still accessible
if t.index == nil || t.head == nil {
return false, errClosed
}
// Ensure only the next item can be written, nothing else
if atomic.LoadUint64(&t.items) != item {
return false, fmt.Errorf("appending unexpected item: want %d, have %d", t.items, item)
}
bLen := uint32(len(encodedBlob))
if t.headBytes+bLen < bLen ||
t.headBytes+bLen > t.maxFileSize {
// we need a new file, writing would overflow
t.lock.RUnlock()
t.lock.Lock()
// Writing would overflow, so we need to open a new data file.
// If we don't already hold the writelock, abort and let the caller
// invoke this method a second time.
if !wlock {
return true, nil
}
nextID := atomic.LoadUint32(&t.headId) + 1
// We open the next file in truncated mode -- if this file already
// exists, we need to start over from scratch on it
newHead, err := t.openFile(nextID, openFreezerFileTruncated)
if err != nil {
t.lock.Unlock()
return err
return false, err
}
// Close old file, and reopen in RDONLY mode
t.releaseFile(t.headId)
@ -503,13 +527,9 @@ func (t *freezerTable) Append(item uint64, blob []byte) error {
t.head = newHead
atomic.StoreUint32(&t.headBytes, 0)
atomic.StoreUint32(&t.headId, nextID)
t.lock.Unlock()
t.lock.RLock()
}
defer t.lock.RUnlock()
if _, err := t.head.Write(blob); err != nil {
return err
if _, err := t.head.Write(encodedBlob); err != nil {
return false, err
}
newOffset := atomic.AddUint32(&t.headBytes, bLen)
idx := indexEntry{
@ -523,7 +543,7 @@ func (t *freezerTable) Append(item uint64, blob []byte) error {
t.sizeGauge.Inc(int64(bLen + indexEntrySize))
atomic.AddUint64(&t.items, 1)
return nil
return false, nil
}
// getBounds returns the indexes for the item
@ -562,44 +582,48 @@ func (t *freezerTable) getBounds(item uint64) (uint32, uint32, uint32, error) {
// Retrieve looks up the data offset of an item with the given number and retrieves
// the raw binary blob from the data file.
func (t *freezerTable) Retrieve(item uint64) ([]byte, error) {
blob, err := t.retrieve(item)
if err != nil {
return nil, err
}
if t.noCompression {
return blob, nil
}
return snappy.Decode(nil, blob)
}
// retrieve looks up the data offset of an item with the given number and retrieves
// the raw binary blob from the data file. OBS! This method does not decode
// compressed data.
func (t *freezerTable) retrieve(item uint64) ([]byte, error) {
t.lock.RLock()
defer t.lock.RUnlock()
// Ensure the table and the item is accessible
if t.index == nil || t.head == nil {
t.lock.RUnlock()
return nil, errClosed
}
if atomic.LoadUint64(&t.items) <= item {
t.lock.RUnlock()
return nil, errOutOfBounds
}
// Ensure the item was not deleted from the tail either
if uint64(t.itemOffset) > item {
t.lock.RUnlock()
return nil, errOutOfBounds
}
startOffset, endOffset, filenum, err := t.getBounds(item - uint64(t.itemOffset))
if err != nil {
t.lock.RUnlock()
return nil, err
}
dataFile, exist := t.files[filenum]
if !exist {
t.lock.RUnlock()
return nil, fmt.Errorf("missing data file %d", filenum)
}
// Retrieve the data itself, decompress and return
blob := make([]byte, endOffset-startOffset)
if _, err := dataFile.ReadAt(blob, int64(startOffset)); err != nil {
t.lock.RUnlock()
return nil, err
}
t.lock.RUnlock()
t.readMeter.Mark(int64(len(blob) + 2*indexEntrySize))
if t.noCompression {
return blob, nil
}
return snappy.Decode(nil, blob)
return blob, nil
}
// has returns an indicator whether the specified number data

@ -18,10 +18,13 @@ package rawdb
import (
"bytes"
"encoding/binary"
"fmt"
"io/ioutil"
"math/rand"
"os"
"path/filepath"
"sync"
"testing"
"time"
@ -637,6 +640,55 @@ func TestOffset(t *testing.T) {
// 1. have data files d0, d1, d2, d3
// 2. remove d2,d3
//
// However, all 'normal' failure modes arising due to failing to sync() or save a file should be
// handled already, and the case described above can only (?) happen if an external process/user
// deletes files from the filesystem.
// However, all 'normal' failure modes arising due to failing to sync() or save a file
// should be handled already, and the case described above can only (?) happen if an
// external process/user deletes files from the filesystem.
// TestAppendTruncateParallel is a test to check if the Append/truncate operations are
// racy.
//
// The reason why it's not a regular fuzzer, within tests/fuzzers, is that it is dependent
// on timing rather than 'clever' input -- there's no determinism.
func TestAppendTruncateParallel(t *testing.T) {
dir, err := ioutil.TempDir("", "freezer")
if err != nil {
t.Fatal(err)
}
defer os.RemoveAll(dir)
f, err := newCustomTable(dir, "tmp", metrics.NilMeter{}, metrics.NilMeter{}, metrics.NilGauge{}, 8, true)
if err != nil {
t.Fatal(err)
}
fill := func(mark uint64) []byte {
data := make([]byte, 8)
binary.LittleEndian.PutUint64(data, mark)
return data
}
for i := 0; i < 5000; i++ {
f.truncate(0)
data0 := fill(0)
f.Append(0, data0)
data1 := fill(1)
var wg sync.WaitGroup
wg.Add(2)
go func() {
f.truncate(0)
wg.Done()
}()
go func() {
f.Append(1, data1)
wg.Done()
}()
wg.Wait()
if have, err := f.Retrieve(0); err == nil {
if !bytes.Equal(have, data0) {
t.Fatalf("have %x want %x", have, data0)
}
}
}
}