Telegraf 0.1.5, update InfluxDB client to HEAD

2015-08-18 12:11:41 -06:00 · 2015-08-18 12:11:41 -06:00 · d98bedd6e1
parent 461245c83d
commit d98bedd6e1
52 changed files with 8464 additions and 3937 deletions
--- a/Godeps/Godeps.json
+++ b/Godeps/Godeps.json
@ -89,33 +89,33 @@
 		},
 		{
 			"ImportPath": "github.com/influxdb/influxdb/client",
-			"Comment": "v0.9.2",
-			"Rev": "6c0a91f775f9fc0e625d17ffa04a3fe86945ba09"
+			"Comment": "v0.9.1-rc1-545-g8de66eb",
+			"Rev": "8de66eb37024cd6bd953662e5588253f0888874b"
 		},
 		{
 			"ImportPath": "github.com/influxdb/influxdb/influxql",
-			"Comment": "v0.9.2",
-			"Rev": "6c0a91f775f9fc0e625d17ffa04a3fe86945ba09"
+			"Comment": "v0.9.1-rc1-545-g8de66eb",
+			"Rev": "8de66eb37024cd6bd953662e5588253f0888874b"
 		},
 		{
 			"ImportPath": "github.com/influxdb/influxdb/meta",
-			"Comment": "v0.9.2",
-			"Rev": "6c0a91f775f9fc0e625d17ffa04a3fe86945ba09"
+			"Comment": "v0.9.1-rc1-545-g8de66eb",
+			"Rev": "8de66eb37024cd6bd953662e5588253f0888874b"
 		},
 		{
 			"ImportPath": "github.com/influxdb/influxdb/snapshot",
-			"Comment": "v0.9.2",
-			"Rev": "6c0a91f775f9fc0e625d17ffa04a3fe86945ba09"
+			"Comment": "v0.9.1-rc1-545-g8de66eb",
+			"Rev": "8de66eb37024cd6bd953662e5588253f0888874b"
 		},
 		{
 			"ImportPath": "github.com/influxdb/influxdb/toml",
-			"Comment": "v0.9.2",
-			"Rev": "6c0a91f775f9fc0e625d17ffa04a3fe86945ba09"
+			"Comment": "v0.9.1-rc1-545-g8de66eb",
+			"Rev": "8de66eb37024cd6bd953662e5588253f0888874b"
 		},
 		{
 			"ImportPath": "github.com/influxdb/influxdb/tsdb",
-			"Comment": "v0.9.2",
-			"Rev": "6c0a91f775f9fc0e625d17ffa04a3fe86945ba09"
+			"Comment": "v0.9.1-rc1-545-g8de66eb",
+			"Rev": "8de66eb37024cd6bd953662e5588253f0888874b"
 		},
 		{
 			"ImportPath": "github.com/lib/pq",
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/client/README.md
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/client/README.md
@ -45,6 +45,7 @@ the configuration below.
 package main

 import "github.com/influxdb/influxdb/client"
+import "net/url"

 const (
 	MyHost        = "localhost"
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/client/influxdb.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/client/influxdb.go
@ -5,22 +5,69 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
-	"io"
 	"io/ioutil"
+	"net"
 	"net/http"
 	"net/url"
+	"strconv"
+	"strings"
 	"time"

 	"github.com/influxdb/influxdb/influxql"
 	"github.com/influxdb/influxdb/tsdb"
 )

+const (
+	// DefaultHost is the default host used to connect to an InfluxDB instance.
+	// ParseConnectionString substitutes it when the host part before ":" is empty.
+	DefaultHost = "localhost"
+
+	// DefaultPort is the default port used to connect to an InfluxDB instance.
+	// ParseConnectionString uses it when the input carries no port.
+	DefaultPort = 8086
+
+	// DefaultTimeout is the default connection timeout used to connect to an InfluxDB instance.
+	// NewConfig assigns it to Config.Timeout.
+	// NOTE(review): presumably zero means "no timeout" — confirm against how Config.Timeout is consumed.
+	DefaultTimeout = 0
+)
+
 // Query is used to send a command to the server. Both Command and Database are required.
 type Query struct {
 	Command  string
 	Database string
 }

+// ParseConnectionString builds a connection URL from path, which is expected
+// to look like "host", "host:port" or ":port".
+//
+// When path contains a colon, the segment following the first colon must be an
+// integer port, and an empty host before the colon is replaced by DefaultHost.
+// When path contains no colon, it is used verbatim as the host and the port is
+// DefaultPort. The scheme is "https" when ssl is true and "http" otherwise.
+//
+// An error is returned when the port segment cannot be parsed as an integer.
+func ParseConnectionString(path string, ssl bool) (url.URL, error) {
+	var host string
+	var port int
+
+	if strings.Contains(path, ":") {
+		h := strings.Split(path, ":")
+		i, e := strconv.Atoi(h[1])
+		if e != nil {
+			// Error strings must not end in a newline: callers wrap and log them.
+			return url.URL{}, fmt.Errorf("invalid port number %q: %s", path, e)
+		}
+		port = i
+		if h[0] == "" {
+			host = DefaultHost
+		} else {
+			host = h[0]
+		}
+	} else {
+		host = path
+		// If they didn't specify a port, always use the default port
+		port = DefaultPort
+	}
+
+	u := url.URL{
+		Scheme: "http",
+	}
+	if ssl {
+		u.Scheme = "https"
+	}
+	u.Host = net.JoinHostPort(host, strconv.Itoa(port))
+
+	return u, nil
+}
+
 // Config is used to specify what server to connect to.
 // URL: The URL of the server connecting to.
 // Username/Password are optional.  They will be passed via basic auth if provided.
@ -34,6 +81,13 @@ type Config struct {
 	Timeout   time.Duration
 }

+// NewConfig returns a Config whose Timeout is preset to DefaultTimeout; every
+// other field is left at its zero value for the caller to fill in.
+func NewConfig() Config {
+	var cfg Config
+	cfg.Timeout = DefaultTimeout
+	return cfg
+}
+
 // Client is used to make calls to the server.
 type Client struct {
 	url        url.URL
@ -120,7 +174,8 @@ func (c *Client) Query(q Query) (*Response, error) {
 // If successful, error is nil and Response is nil
 // If an error occurs, Response may contain additional information if populated.
 func (c *Client) Write(bp BatchPoints) (*Response, error) {
-	c.url.Path = "write"
+	u := c.url
+	u.Path = "write"

 	var b bytes.Buffer
 	for _, p := range bp.Points {
@ -146,7 +201,7 @@ func (c *Client) Write(bp BatchPoints) (*Response, error) {
 		}
 	}

-	req, err := http.NewRequest("POST", c.url.String(), &b)
+	req, err := http.NewRequest("POST", u.String(), &b)
 	if err != nil {
 		return nil, err
 	}
@ -156,10 +211,10 @@ func (c *Client) Write(bp BatchPoints) (*Response, error) {
 		req.SetBasicAuth(c.username, c.password)
 	}
 	params := req.URL.Query()
-	params.Add("db", bp.Database)
-	params.Add("rp", bp.RetentionPolicy)
-	params.Add("precision", bp.Precision)
-	params.Add("consistency", bp.WriteConsistency)
+	params.Set("db", bp.Database)
+	params.Set("rp", bp.RetentionPolicy)
+	params.Set("precision", bp.Precision)
+	params.Set("consistency", bp.WriteConsistency)
 	req.URL.RawQuery = params.Encode()

 	resp, err := c.httpClient.Do(req)
@ -170,7 +225,7 @@ func (c *Client) Write(bp BatchPoints) (*Response, error) {

 	var response Response
 	body, err := ioutil.ReadAll(resp.Body)
-	if err != nil && err.Error() != "EOF" {
+	if err != nil {
 		return nil, err
 	}

@ -183,6 +238,52 @@ func (c *Client) Write(bp BatchPoints) (*Response, error) {
 	return nil, nil
 }

+// WriteLineProtocol POSTs data to the server's write endpoint. data is sent
+// as-is as the request body and is expected to hold one write per line.
+//
+// database, retentionPolicy, precision and writeConsistency are passed as the
+// "db", "rp", "precision" and "consistency" query parameters respectively.
+//
+// If successful, error is nil and Response is nil.
+// If the server replies with a status other than 200 or 204, the response body
+// is returned as the error and is also set on the returned Response's Err.
+func (c *Client) WriteLineProtocol(data, database, retentionPolicy, precision, writeConsistency string) (*Response, error) {
+	// c.url is a value, so this copy keeps the client's base URL untouched.
+	u := c.url
+	u.Path = "write"
+
+	r := strings.NewReader(data)
+
+	req, err := http.NewRequest("POST", u.String(), r)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("Content-Type", "")
+	req.Header.Set("User-Agent", c.userAgent)
+	if c.username != "" {
+		req.SetBasicAuth(c.username, c.password)
+	}
+	params := req.URL.Query()
+	params.Set("db", database)
+	params.Set("rp", retentionPolicy)
+	params.Set("precision", precision)
+	params.Set("consistency", writeConsistency)
+	req.URL.RawQuery = params.Encode()
+
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	var response Response
+	body, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return nil, err
+	}
+
+	if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK {
+		// The body is server-supplied text, not a format string: errors.New
+		// keeps any '%' in it verbatim, where fmt.Errorf would mangle it.
+		err := errors.New(string(body))
+		response.Err = err
+		return &response, err
+	}
+
+	return nil, nil
+}
+
 // Ping will check to see if the server is up
 // Ping returns how long the request took, the version of the server it connected to, and an error if one occurred.
 func (c *Client) Ping() (time.Duration, string, error) {
@ -209,34 +310,6 @@ func (c *Client) Ping() (time.Duration, string, error) {
 	return time.Since(now), version, nil
 }

-// Dump connects to server and retrieves all data stored for specified database.
-// If successful, Dump returns the entire response body, which is an io.ReadCloser
-func (c *Client) Dump(db string) (io.ReadCloser, error) {
-	u := c.url
-	u.Path = "dump"
-	values := u.Query()
-	values.Set("db", db)
-	u.RawQuery = values.Encode()
-
-	req, err := http.NewRequest("GET", u.String(), nil)
-	if err != nil {
-		return nil, err
-	}
-	req.Header.Set("User-Agent", c.userAgent)
-	if c.username != "" {
-		req.SetBasicAuth(c.username, c.password)
-	}
-
-	resp, err := c.httpClient.Do(req)
-	if err != nil {
-		return nil, err
-	}
-	if resp.StatusCode != http.StatusOK {
-		return resp.Body, fmt.Errorf("HTTP Protocol error %d", resp.StatusCode)
-	}
-	return resp.Body, nil
-}
-
 // Structs

 // Result represents a resultset returned from a single statement.
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/INFLUXQL.md
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/INFLUXQL.md
@ -469,14 +469,14 @@ SHOW MEASUREMENTS WHERE region = 'uswest' AND host = 'serverA';
 ### SHOW RETENTION POLICIES

 ```
-show_retention_policies = "SHOW RETENTION POLICIES" db_name .
+show_retention_policies = "SHOW RETENTION POLICIES ON" db_name .
 ```

 #### Example:

 ```sql
 -- show all retention policies on a database
-SHOW RETENTION POLICIES mydb;
+SHOW RETENTION POLICIES ON mydb;
 ```

 ### SHOW SERIES
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/ast.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/ast.go
@ -936,6 +936,11 @@ func (s *SelectStatement) walkForTime(node Node) bool {

 // HasWildcard returns whether or not the select statement has at least 1 wildcard
 func (s *SelectStatement) HasWildcard() bool {
+	return s.HasFieldWildcard() || s.HasDimensionWildcard()
+}
+
+// HasFieldWildcard returns whether or not the select statement has at least 1 wildcard in the fields
+func (s *SelectStatement) HasFieldWildcard() bool {
 	for _, f := range s.Fields {
 		_, ok := f.Expr.(*Wildcard)
 		if ok {
@ -943,6 +948,12 @@ func (s *SelectStatement) HasWildcard() bool {
 		}
 	}

+	return false
+}
+
+// HasDimensionWildcard returns whether or not the select statement has
+// at least 1 wildcard in the dimensions aka `GROUP BY`
+func (s *SelectStatement) HasDimensionWildcard() bool {
 	for _, d := range s.Dimensions {
 		_, ok := d.Expr.(*Wildcard)
 		if ok {
@ -990,11 +1001,27 @@ func (s *SelectStatement) validate(tr targetRequirement) error {
 		return err
 	}

+	if err := s.validateWildcard(); err != nil {
+		return err
+	}
+
 	return nil
 }

 func (s *SelectStatement) validateAggregates(tr targetRequirement) error {
-	// First, determine if specific calls have at least one and only one argument
+	// First, if 1 field is an aggregate, then all fields must be an aggregate. This is
+	// an explicit limitation of the current system.
+	numAggregates := 0
+	for _, f := range s.Fields {
+		if _, ok := f.Expr.(*Call); ok {
+			numAggregates++
+		}
+	}
+	if numAggregates != 0 && numAggregates != len(s.Fields) {
+		return fmt.Errorf("mixing aggregate and non-aggregate queries is not supported")
+	}
+
+	// Secondly, determine if specific calls have at least one and only one argument
 	for _, f := range s.Fields {
 		if c, ok := f.Expr.(*Call); ok {
 			switch c.Name {
@ -1033,6 +1060,13 @@ func (s *SelectStatement) validateAggregates(tr targetRequirement) error {
 	return nil
 }

+// validateWildcard returns an error when the statement contains a wildcard and
+// lists more than one field.
+// NOTE(review): HasWildcard also reports wildcards in the dimensions, so a
+// multi-field statement with a GROUP BY wildcard is rejected too — confirm
+// that this is intended.
+func (s *SelectStatement) validateWildcard() error {
+	if len(s.Fields) <= 1 || !s.HasWildcard() {
+		return nil
+	}
+	return fmt.Errorf("wildcards can not be combined with other fields")
+}
+
 func (s *SelectStatement) HasDistinct() bool {
 	// determine if we have a call named distinct
 	for _, f := range s.Fields {
@ -1321,6 +1355,17 @@ func (s *SelectStatement) NamesInSelect() []string {
 	return a
 }

+// NamesInDimension collects, in order, the names that walkNames finds in each
+// dimension expression of the statement (its GROUP BY clause).
+func (s *SelectStatement) NamesInDimension() []string {
+	var names []string
+	for i := range s.Dimensions {
+		found := walkNames(s.Dimensions[i].Expr)
+		names = append(names, found...)
+	}
+	return names
+}
+
 // walkNames will walk the Expr and return the database fields
 func walkNames(exp Expr) []string {
 	switch expr := exp.(type) {
@ -1953,6 +1998,32 @@ func (s *ShowFieldKeysStatement) RequiredPrivileges() ExecutionPrivileges {
 // Fields represents a list of fields.
 type Fields []*Field

+// AliasNames returns the result of Name() for every field, in field order.
+// The returned slice is never nil, even when there are no fields.
+func (a Fields) AliasNames() []string {
+	out := make([]string, 0, len(a))
+	for i := range a {
+		out = append(out, a[i].Name())
+	}
+	return out
+}
+
+// Names returns one raw name per field, in field order: the function name for
+// a *Call, the referenced name for a *VarRef, and the empty string for any
+// other expression type. The returned slice is never nil.
+func (a Fields) Names() []string {
+	out := make([]string, len(a))
+	for i, field := range a {
+		if call, isCall := field.Expr.(*Call); isCall {
+			out[i] = call.Name
+		} else if ref, isRef := field.Expr.(*VarRef); isRef {
+			out[i] = ref.Val
+		}
+	}
+	return out
+}
+
 // String returns a string representation of the fields.
 func (a Fields) String() string {
 	var str []string
@ -1992,26 +2063,6 @@ func (f *Field) Name() string {
 func (f *Field) String() string {
 	str := f.Expr.String()

-	switch f.Expr.(type) {
-	case *VarRef:
-		quoted := false
-		// Escape any double-quotes in the field
-		if strings.Contains(str, `"`) {
-			str = strings.Replace(str, `"`, `\"`, -1)
-			quoted = true
-		}
-
-		// Escape any single-quotes in the field
-		if strings.Contains(str, `'`) {
-			quoted = true
-		}
-
-		// Double-quote field names with spaces or that were previously escaped
-		if strings.Contains(str, " ") || quoted {
-			str = fmt.Sprintf("\"%s\"", str)
-		}
-	}
-
 	if f.Alias == "" {
 		return str
 	}
@ -2132,7 +2183,9 @@ type VarRef struct {
 }

 // String returns a string representation of the variable reference.
-func (r *VarRef) String() string { return r.Val }
+func (r *VarRef) String() string {
+	return QuoteIdent(r.Val)
+}

 // Call represents a function call.
 type Call struct {
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/ast_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/ast_test.go
@ -44,35 +44,35 @@ func TestSelectStatement_Substatement(t *testing.T) {
 		{
 			stmt: `SELECT sum(aa.value) + sum(bb.value) FROM aa, bb`,
 			expr: &influxql.VarRef{Val: "aa.value"},
-			sub:  `SELECT aa.value FROM aa`,
+			sub:  `SELECT "aa.value" FROM aa`,
 		},

 		// 2. Simple merge
 		{
 			stmt: `SELECT sum(aa.value) + sum(bb.value) FROM aa, bb`,
 			expr: &influxql.VarRef{Val: "bb.value"},
-			sub:  `SELECT bb.value FROM bb`,
+			sub:  `SELECT "bb.value" FROM bb`,
 		},

 		// 3. Join with condition
 		{
 			stmt: `SELECT sum(aa.value) + sum(bb.value) FROM aa, bb WHERE aa.host = 'servera' AND bb.host = 'serverb'`,
 			expr: &influxql.VarRef{Val: "bb.value"},
-			sub:  `SELECT bb.value FROM bb WHERE bb.host = 'serverb'`,
+			sub:  `SELECT "bb.value" FROM bb WHERE "bb.host" = 'serverb'`,
 		},

 		// 4. Join with complex condition
 		{
 			stmt: `SELECT sum(aa.value) + sum(bb.value) FROM aa, bb WHERE aa.host = 'servera' AND (bb.host = 'serverb' OR bb.host = 'serverc') AND 1 = 2`,
 			expr: &influxql.VarRef{Val: "bb.value"},
-			sub:  `SELECT bb.value FROM bb WHERE (bb.host = 'serverb' OR bb.host = 'serverc') AND 1.000 = 2.000`,
+			sub:  `SELECT "bb.value" FROM bb WHERE ("bb.host" = 'serverb' OR "bb.host" = 'serverc') AND 1.000 = 2.000`,
 		},

 		// 5. 4 with different condition order
 		{
 			stmt: `SELECT sum(aa.value) + sum(bb.value) FROM aa, bb WHERE ((bb.host = 'serverb' OR bb.host = 'serverc') AND aa.host = 'servera') AND 1 = 2`,
 			expr: &influxql.VarRef{Val: "bb.value"},
-			sub:  `SELECT bb.value FROM bb WHERE ((bb.host = 'serverb' OR bb.host = 'serverc')) AND 1.000 = 2.000`,
+			sub:  `SELECT "bb.value" FROM bb WHERE (("bb.host" = 'serverb' OR "bb.host" = 'serverc')) AND 1.000 = 2.000`,
 		},
 	}

@ -217,7 +217,7 @@ func TestSelectStatement_SetTimeRange(t *testing.T) {

 // Ensure the idents from the select clause can come out
 func TestSelect_NamesInSelect(t *testing.T) {
-	s := MustParseSelectStatement("select count(asdf), bar from cpu")
+	s := MustParseSelectStatement("select count(asdf), count(bar) from cpu")
 	a := s.NamesInSelect()
 	if !reflect.DeepEqual(a, []string{"asdf", "bar"}) {
 		t.Fatal("expected names asdf and bar")
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/functions.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/functions.go
@ -550,10 +550,9 @@ func ReduceMedian(values []interface{}) interface{} {
 		sortedRange = getSortedRange(data, middle-1, 2)
 		var low, high = sortedRange[0], sortedRange[1]
 		return low + (high-low)/2
-	} else {
-		sortedRange = getSortedRange(data, middle, 1)
-		return sortedRange[0]
 	}
+	sortedRange = getSortedRange(data, middle, 1)
+	return sortedRange[0]
 }

 // getSortedRange returns a sorted subset of data. By using discardLowerRange and discardUpperRange to get the target
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/parser.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/parser.go
@ -1471,11 +1471,18 @@ func (p *Parser) parseFields() (Fields, error) {
 func (p *Parser) parseField() (*Field, error) {
 	f := &Field{}

+	_, pos, _ := p.scanIgnoreWhitespace()
+	p.unscan()
 	// Parse the expression first.
 	expr, err := p.ParseExpr()
 	if err != nil {
 		return nil, err
 	}
+	var c validateField
+	Walk(&c, expr)
+	if c.foundInvalid {
+		return nil, fmt.Errorf("invalid operator %s in SELECT clause at line %d, char %d; operator is intended for WHERE clause", c.badToken, pos.Line+1, pos.Char+1)
+	}
 	f.Expr = expr

 	// Parse the alias if the current and next tokens are "WS AS".
@ -1491,6 +1498,30 @@ func (p *Parser) parseField() (*Field, error) {
 	return f, nil
 }

+// validateField is a Visitor that flags field expressions containing a binary
+// operator that yields a boolean (comparison, regex match, AND, OR).
+type validateField struct {
+	foundInvalid bool  // set once a disallowed operator has been seen
+	badToken     Token // the disallowed operator that was found
+}
+
+// Visit records the operator of a disallowed binary expression and stops
+// descending into it; any other node is walked further with the same visitor.
+func (c *validateField) Visit(n Node) Visitor {
+	if bin, isBinary := n.(*BinaryExpr); isBinary {
+		switch bin.Op {
+		case EQ, NEQ, EQREGEX, NEQREGEX,
+			LT, LTE, GT, GTE,
+			AND, OR:
+			c.foundInvalid, c.badToken = true, bin.Op
+			return nil
+		}
+	}
+	return c
+}
+
 // parseAlias parses the "AS (IDENT|STRING)" alias for fields and dimensions.
 func (p *Parser) parseAlias() (string, error) {
 	// Check if the next token is "AS". If not, then unscan and exit.
@ -1660,31 +1691,31 @@ func (p *Parser) parseFill() (FillOption, interface{}, error) {
 		p.unscan()
 		return NullFill, nil, nil
 	}
-	if lit, ok := expr.(*Call); !ok {
+	lit, ok := expr.(*Call)
+	if !ok {
 		p.unscan()
 		return NullFill, nil, nil
-	} else {
-		if strings.ToLower(lit.Name) != "fill" {
-			p.unscan()
-			return NullFill, nil, nil
-		}
-		if len(lit.Args) != 1 {
-			return NullFill, nil, errors.New("fill requires an argument, e.g.: 0, null, none, previous")
-		}
-		switch lit.Args[0].String() {
-		case "null":
-			return NullFill, nil, nil
-		case "none":
-			return NoFill, nil, nil
-		case "previous":
-			return PreviousFill, nil, nil
-		default:
-			num, ok := lit.Args[0].(*NumberLiteral)
-			if !ok {
-				return NullFill, nil, fmt.Errorf("expected number argument in fill()")
-			}
-			return NumberFill, num.Val, nil
+	}
+	if strings.ToLower(lit.Name) != "fill" {
+		p.unscan()
+		return NullFill, nil, nil
+	}
+	if len(lit.Args) != 1 {
+		return NullFill, nil, errors.New("fill requires an argument, e.g.: 0, null, none, previous")
+	}
+	switch lit.Args[0].String() {
+	case "null":
+		return NullFill, nil, nil
+	case "none":
+		return NoFill, nil, nil
+	case "previous":
+		return PreviousFill, nil, nil
+	default:
+		num, ok := lit.Args[0].(*NumberLiteral)
+		if !ok {
+			return NullFill, nil, fmt.Errorf("expected number argument in fill()")
 		}
+		return NumberFill, num.Val, nil
 	}
 }

@ -2186,6 +2217,11 @@ func QuoteIdent(segments ...string) string {

 // IdentNeedsQuotes returns true if the ident string given would require quotes.
 func IdentNeedsQuotes(ident string) bool {
+	// check if this identifier is a keyword
+	tok := Lookup(ident)
+	if tok != IDENT {
+		return true
+	}
 	for i, r := range ident {
 		if i == 0 && !isIdentFirstChar(r) {
 			return true
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/parser_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/parser_test.go
@ -1225,12 +1225,13 @@ func TestParser_ParseStatement(t *testing.T) {
 		{s: `SELECT field1 FROM myseries ORDER BY time, field1`, err: `only ORDER BY time ASC supported at this time`},
 		{s: `SELECT field1 AS`, err: `found EOF, expected identifier at line 1, char 18`},
 		{s: `SELECT field1 FROM foo group by time(1s)`, err: `GROUP BY requires at least one aggregate function`},
+		{s: `SELECT count(value), value FROM foo`, err: `mixing aggregate and non-aggregate queries is not supported`},
 		{s: `SELECT count(value) FROM foo group by time(1s)`, err: `aggregate functions with GROUP BY time require a WHERE time clause`},
 		{s: `SELECT count(value) FROM foo group by time(1s) where host = 'hosta.influxdb.org'`, err: `aggregate functions with GROUP BY time require a WHERE time clause`},
 		{s: `SELECT field1 FROM 12`, err: `found 12, expected identifier at line 1, char 20`},
 		{s: `SELECT 1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 FROM myseries`, err: `unable to parse number at line 1, char 8`},
 		{s: `SELECT 10.5h FROM myseries`, err: `found h, expected FROM at line 1, char 12`},
-		{s: `SELECT derivative(field1), field1 FROM myseries`, err: `derivative cannot be used with other fields`},
+		{s: `SELECT derivative(field1), field1 FROM myseries`, err: `mixing aggregate and non-aggregate queries is not supported`},
 		{s: `SELECT distinct(field1), sum(field1) FROM myseries`, err: `aggregate function distinct() can not be combined with other functions or fields`},
 		{s: `SELECT distinct(field1), field2 FROM myseries`, err: `aggregate function distinct() can not be combined with other functions or fields`},
 		{s: `SELECT distinct(field1, field2) FROM myseries`, err: `distinct function can only have one argument`},
@ -1244,6 +1245,12 @@ func TestParser_ParseStatement(t *testing.T) {
 		{s: `select derivative() from myseries`, err: `invalid number of arguments for derivative, expected at least 1 but no more than 2, got 0`},
 		{s: `select derivative(mean(value), 1h, 3) from myseries`, err: `invalid number of arguments for derivative, expected at least 1 but no more than 2, got 3`},
 		{s: `SELECT field1 from myseries WHERE host =~ 'asd' LIMIT 1`, err: `found asd, expected regex at line 1, char 42`},
+		{s: `SELECT value > 2 FROM cpu`, err: `invalid operator > in SELECT clause at line 1, char 8; operator is intended for WHERE clause`},
+		{s: `SELECT value = 2 FROM cpu`, err: `invalid operator = in SELECT clause at line 1, char 8; operator is intended for WHERE clause`},
+		{s: `SELECT s =~ /foo/ FROM cpu`, err: `invalid operator =~ in SELECT clause at line 1, char 8; operator is intended for WHERE clause`},
+		{s: `SELECT foo, * from cpu`, err: `wildcards can not be combined with other fields`},
+		{s: `SELECT *, * from cpu`, err: `found ,, expected FROM at line 1, char 9`},
+		{s: `SELECT *, foo from cpu`, err: `found ,, expected FROM at line 1, char 9`},
 		{s: `DELETE`, err: `found EOF, expected FROM at line 1, char 8`},
 		{s: `DELETE FROM`, err: `found EOF, expected identifier at line 1, char 13`},
 		{s: `DELETE FROM myseries WHERE`, err: `found EOF, expected identifier, string, number, bool at line 1, char 28`},
@ -1661,6 +1668,8 @@ func TestQuoteIdent(t *testing.T) {
 		s     string
 	}{
 		{[]string{``}, ``},
+		{[]string{`select`}, `"select"`},
+		{[]string{`in-bytes`}, `"in-bytes"`},
 		{[]string{`foo`, `bar`}, `"foo".bar`},
 		{[]string{`foo`, ``, `bar`}, `"foo"..bar`},
 		{[]string{`foo bar`, `baz`}, `"foo bar".baz`},
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/result.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/result.go
@ -166,12 +166,8 @@ func newBinaryExprEvaluator(op Token, lhs, rhs Processor) Processor {
 		return func(values []interface{}) interface{} {
 			l := lhs(values)
 			r := rhs(values)
-			if lv, ok := l.(float64); ok {
-				if rv, ok := r.(float64); ok {
-					if rv != 0 {
-						return lv + rv
-					}
-				}
+			if lf, rf, ok := processorValuesAsFloat64(l, r); ok {
+				return lf + rf
 			}
 			return nil
 		}
@ -179,12 +175,8 @@ func newBinaryExprEvaluator(op Token, lhs, rhs Processor) Processor {
 		return func(values []interface{}) interface{} {
 			l := lhs(values)
 			r := rhs(values)
-			if lv, ok := l.(float64); ok {
-				if rv, ok := r.(float64); ok {
-					if rv != 0 {
-						return lv - rv
-					}
-				}
+			if lf, rf, ok := processorValuesAsFloat64(l, r); ok {
+				return lf - rf
 			}
 			return nil
 		}
@ -192,12 +184,8 @@ func newBinaryExprEvaluator(op Token, lhs, rhs Processor) Processor {
 		return func(values []interface{}) interface{} {
 			l := lhs(values)
 			r := rhs(values)
-			if lv, ok := l.(float64); ok {
-				if rv, ok := r.(float64); ok {
-					if rv != 0 {
-						return lv * rv
-					}
-				}
+			if lf, rf, ok := processorValuesAsFloat64(l, r); ok {
+				return lf * rf
 			}
 			return nil
 		}
@ -205,12 +193,8 @@ func newBinaryExprEvaluator(op Token, lhs, rhs Processor) Processor {
 		return func(values []interface{}) interface{} {
 			l := lhs(values)
 			r := rhs(values)
-			if lv, ok := l.(float64); ok {
-				if rv, ok := r.(float64); ok {
-					if rv != 0 {
-						return lv / rv
-					}
-				}
+			if lf, rf, ok := processorValuesAsFloat64(l, r); ok {
+				return lf / rf
 			}
 			return nil
 		}
@ -221,3 +205,27 @@ func newBinaryExprEvaluator(op Token, lhs, rhs Processor) Processor {
 		}
 	}
 }
+
+// processorValuesAsFloat64 converts both operands to float64. Each operand
+// must hold either a float64 or an int64; an int64 is widened to float64.
+// If either operand holds anything else, it returns 0, 0, false.
+func processorValuesAsFloat64(lhs interface{}, rhs interface{}) (float64, float64, bool) {
+	// asFloat handles the two accepted dynamic types of a single operand.
+	asFloat := func(v interface{}) (float64, bool) {
+		switch n := v.(type) {
+		case float64:
+			return n, true
+		case int64:
+			return float64(n), true
+		}
+		return 0, false
+	}
+
+	left, ok := asFloat(lhs)
+	if !ok {
+		return 0, 0, false
+	}
+	right, ok := asFloat(rhs)
+	if !ok {
+		return 0, 0, false
+	}
+	return left, right, true
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/scanner.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/scanner.go
@ -514,6 +514,8 @@ func ScanString(r io.RuneScanner) (string, error) {
 				_, _ = buf.WriteRune('\\')
 			} else if ch1 == '"' {
 				_, _ = buf.WriteRune('"')
+			} else if ch1 == '\'' {
+				_, _ = buf.WriteRune('\'')
 			} else {
 				return string(ch0) + string(ch1), errBadEscape
 			}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/scanner_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/influxql/scanner_test.go
@ -243,6 +243,7 @@ func TestScanString(t *testing.T) {
 		{in: `"foo\nbar"`, out: "foo\nbar"},
 		{in: `"foo\\bar"`, out: `foo\bar`},
 		{in: `"foo\"bar"`, out: `foo"bar`},
+		{in: `'foo\'bar'`, out: `foo'bar`},

 		{in: `"foo` + "\n", out: `foo`, err: "bad string"}, // newline in string
 		{in: `"foo`, out: `foo`, err: "bad string"},        // unclosed quotes
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/config.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/config.go
@ -31,16 +31,17 @@ type Config struct {
 	Dir                 string        `toml:"dir"`
 	Hostname            string        `toml:"hostname"`
 	BindAddress         string        `toml:"bind-address"`
-	Peers               []string      `toml:"peers"`
+	Peers               []string      `toml:"-"`
 	RetentionAutoCreate bool          `toml:"retention-autocreate"`
 	ElectionTimeout     toml.Duration `toml:"election-timeout"`
 	HeartbeatTimeout    toml.Duration `toml:"heartbeat-timeout"`
 	LeaderLeaseTimeout  toml.Duration `toml:"leader-lease-timeout"`
 	CommitTimeout       toml.Duration `toml:"commit-timeout"`
+	ClusterTracing      bool          `toml:"cluster-tracing"`
 }

-func NewConfig() Config {
-	return Config{
+func NewConfig() *Config {
+	return &Config{
 		Hostname:            DefaultHostname,
 		BindAddress:         DefaultBindAddress,
 		RetentionAutoCreate: true,
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/data.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/data.go
@ -141,8 +141,8 @@ func (data *Data) CreateRetentionPolicy(database string, rpi *RetentionPolicyInf
 	// Validate retention policy.
 	if rpi.Name == "" {
 		return ErrRetentionPolicyNameRequired
-	} else if rpi.ReplicaN != len(data.Nodes) {
-		return ErrReplicationFactorMismatch
+	} else if rpi.ReplicaN < 1 {
+		return ErrReplicationFactorTooLow
 	}

 	// Find database.
@ -706,14 +706,18 @@ func (di *DatabaseInfo) unmarshal(pb *internal.DatabaseInfo) {
 	di.Name = pb.GetName()
 	di.DefaultRetentionPolicy = pb.GetDefaultRetentionPolicy()

-	di.RetentionPolicies = make([]RetentionPolicyInfo, len(pb.GetRetentionPolicies()))
-	for i, x := range pb.GetRetentionPolicies() {
-		di.RetentionPolicies[i].unmarshal(x)
+	if len(pb.GetRetentionPolicies()) > 0 {
+		di.RetentionPolicies = make([]RetentionPolicyInfo, len(pb.GetRetentionPolicies()))
+		for i, x := range pb.GetRetentionPolicies() {
+			di.RetentionPolicies[i].unmarshal(x)
+		}
 	}

-	di.ContinuousQueries = make([]ContinuousQueryInfo, len(pb.GetContinuousQueries()))
-	for i, x := range pb.GetContinuousQueries() {
-		di.ContinuousQueries[i].unmarshal(x)
+	if len(pb.GetContinuousQueries()) > 0 {
+		di.ContinuousQueries = make([]ContinuousQueryInfo, len(pb.GetContinuousQueries()))
+		for i, x := range pb.GetContinuousQueries() {
+			di.ContinuousQueries[i].unmarshal(x)
+		}
 	}
 }

@ -794,9 +798,11 @@ func (rpi *RetentionPolicyInfo) unmarshal(pb *internal.RetentionPolicyInfo) {
 	rpi.Duration = time.Duration(pb.GetDuration())
 	rpi.ShardGroupDuration = time.Duration(pb.GetShardGroupDuration())

-	rpi.ShardGroups = make([]ShardGroupInfo, len(pb.GetShardGroups()))
-	for i, x := range pb.GetShardGroups() {
-		rpi.ShardGroups[i].unmarshal(x)
+	if len(pb.GetShardGroups()) > 0 {
+		rpi.ShardGroups = make([]ShardGroupInfo, len(pb.GetShardGroups()))
+		for i, x := range pb.GetShardGroups() {
+			rpi.ShardGroups[i].unmarshal(x)
+		}
 	}
 }

@ -900,9 +906,11 @@ func (sgi *ShardGroupInfo) unmarshal(pb *internal.ShardGroupInfo) {
 	sgi.EndTime = UnmarshalTime(pb.GetEndTime())
 	sgi.DeletedAt = UnmarshalTime(pb.GetDeletedAt())

-	sgi.Shards = make([]ShardInfo, len(pb.GetShards()))
-	for i, x := range pb.GetShards() {
-		sgi.Shards[i].unmarshal(x)
+	if len(pb.GetShards()) > 0 {
+		sgi.Shards = make([]ShardInfo, len(pb.GetShards()))
+		for i, x := range pb.GetShards() {
+			sgi.Shards[i].unmarshal(x)
+		}
 	}
 }

--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/data_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/data_test.go
@ -127,14 +127,10 @@ func TestData_CreateRetentionPolicy_ErrNameRequired(t *testing.T) {
 	}
 }

-// Ensure that creating a policy with a replication factor that doesn't match
-// the number of nodes in the cluster will return an error. This is a temporary
-// restriction until v0.9.1 is released.
-func TestData_CreateRetentionPolicy_ErrReplicationFactorMismatch(t *testing.T) {
-	data := meta.Data{
-		Nodes: []meta.NodeInfo{{ID: 1}, {ID: 2}, {ID: 3}},
-	}
-	if err := data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0", ReplicaN: 2}); err != meta.ErrReplicationFactorMismatch {
+// Ensure that creating a policy with a replication factor less than 1 returns an error.
+func TestData_CreateRetentionPolicy_ErrReplicationFactorTooLow(t *testing.T) {
+	data := meta.Data{Nodes: []meta.NodeInfo{{ID: 1}}}
+	if err := data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0", ReplicaN: 0}); err != meta.ErrReplicationFactorTooLow {
 		t.Fatalf("unexpected error: %s", err)
 	}
 }
@ -152,10 +148,10 @@ func TestData_CreateRetentionPolicy_ErrRetentionPolicyExists(t *testing.T) {
 	var data meta.Data
 	if err := data.CreateDatabase("db0"); err != nil {
 		t.Fatal(err)
-	} else if err = data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0"}); err != nil {
+	} else if err = data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0", ReplicaN: 1}); err != nil {
 		t.Fatal(err)
 	}
-	if err := data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0"}); err != meta.ErrRetentionPolicyExists {
+	if err := data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0", ReplicaN: 1}); err != meta.ErrRetentionPolicyExists {
 		t.Fatalf("unexpected error: %s", err)
 	}
 }
@ -165,7 +161,7 @@ func TestData_UpdateRetentionPolicy(t *testing.T) {
 	var data meta.Data
 	if err := data.CreateDatabase("db0"); err != nil {
 		t.Fatal(err)
-	} else if err = data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0"}); err != nil {
+	} else if err = data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0", ReplicaN: 1}); err != nil {
 		t.Fatal(err)
 	}

@ -194,7 +190,7 @@ func TestData_DropRetentionPolicy(t *testing.T) {
 	var data meta.Data
 	if err := data.CreateDatabase("db0"); err != nil {
 		t.Fatal(err)
-	} else if err = data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0"}); err != nil {
+	} else if err = data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0", ReplicaN: 1}); err != nil {
 		t.Fatal(err)
 	}

@ -229,9 +225,9 @@ func TestData_RetentionPolicy(t *testing.T) {
 	var data meta.Data
 	if err := data.CreateDatabase("db0"); err != nil {
 		t.Fatal(err)
-	} else if err = data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0"}); err != nil {
+	} else if err = data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0", ReplicaN: 1}); err != nil {
 		t.Fatal(err)
-	} else if err = data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp1"}); err != nil {
+	} else if err = data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp1", ReplicaN: 1}); err != nil {
 		t.Fatal(err)
 	}

@ -240,6 +236,7 @@ func TestData_RetentionPolicy(t *testing.T) {
 	} else if !reflect.DeepEqual(rpi, &meta.RetentionPolicyInfo{
 		Name:               "rp0",
 		ShardGroupDuration: 604800000000000,
+		ReplicaN:           1,
 	}) {
 		t.Fatalf("unexpected value: %#v", rpi)
 	}
@ -258,7 +255,7 @@ func TestData_SetDefaultRetentionPolicy(t *testing.T) {
 	var data meta.Data
 	if err := data.CreateDatabase("db0"); err != nil {
 		t.Fatal(err)
-	} else if err = data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0"}); err != nil {
+	} else if err = data.CreateRetentionPolicy("db0", &meta.RetentionPolicyInfo{Name: "rp0", ReplicaN: 1}); err != nil {
 		t.Fatal(err)
 	}

--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/errors.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/errors.go
@ -58,10 +58,9 @@ var (
 	ErrRetentionPolicyDurationTooLow = errors.New(fmt.Sprintf("retention policy duration must be at least %s",
 		RetentionPolicyMinDuration))

-	// ErrReplicationFactorMismatch is returned when the replication factor
-	// does not match the number of nodes in the cluster. This is a temporary
-	// restriction until v0.9.1 is released.
-	ErrReplicationFactorMismatch = errors.New("replication factor must match cluster size; this limitation will be lifted in v0.9.1")
+	// ErrReplicationFactorTooLow is returned when the replication factor is not in an
+	// acceptable range.
+	ErrReplicationFactorTooLow = errors.New("replication factor must be greater than 0")
 )

 var (
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/internal/meta.pb.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/internal/meta.pb.go
@ -37,7 +37,14 @@ It has these top-level messages:
 	SetPrivilegeCommand
 	SetDataCommand
 	SetAdminPrivilegeCommand
+	UpdateNodeCommand
 	Response
+	ResponseHeader
+	ErrorResponse
+	FetchDataRequest
+	FetchDataResponse
+	JoinRequest
+	JoinResponse
 */
 package internal

@ -48,6 +55,42 @@ import math "math"
 var _ = proto.Marshal
 var _ = math.Inf

+type RPCType int32
+
+const (
+	RPCType_Error     RPCType = 1
+	RPCType_FetchData RPCType = 2
+	RPCType_Join      RPCType = 3
+)
+
+var RPCType_name = map[int32]string{
+	1: "Error",
+	2: "FetchData",
+	3: "Join",
+}
+var RPCType_value = map[string]int32{
+	"Error":     1,
+	"FetchData": 2,
+	"Join":      3,
+}
+
+func (x RPCType) Enum() *RPCType {
+	p := new(RPCType)
+	*p = x
+	return p
+}
+func (x RPCType) String() string {
+	return proto.EnumName(RPCType_name, int32(x))
+}
+func (x *RPCType) UnmarshalJSON(data []byte) error {
+	value, err := proto.UnmarshalJSONEnum(RPCType_value, data, "RPCType")
+	if err != nil {
+		return err
+	}
+	*x = RPCType(value)
+	return nil
+}
+
 type Command_Type int32

 const (
@ -69,6 +112,7 @@ const (
 	Command_SetPrivilegeCommand              Command_Type = 16
 	Command_SetDataCommand                   Command_Type = 17
 	Command_SetAdminPrivilegeCommand         Command_Type = 18
+	Command_UpdateNodeCommand                Command_Type = 19
 )

 var Command_Type_name = map[int32]string{
@ -90,6 +134,7 @@ var Command_Type_name = map[int32]string{
 	16: "SetPrivilegeCommand",
 	17: "SetDataCommand",
 	18: "SetAdminPrivilegeCommand",
+	19: "UpdateNodeCommand",
 }
 var Command_Type_value = map[string]int32{
 	"CreateNodeCommand":                1,
@ -110,6 +155,7 @@ var Command_Type_value = map[string]int32{
 	"SetPrivilegeCommand":              16,
 	"SetDataCommand":                   17,
 	"SetAdminPrivilegeCommand":         18,
+	"UpdateNodeCommand":                19,
 }

 func (x Command_Type) Enum() *Command_Type {
@ -1112,6 +1158,38 @@ var E_SetAdminPrivilegeCommand_Command = &proto.ExtensionDesc{
 	Tag:           "bytes,118,opt,name=command",
 }

+type UpdateNodeCommand struct {
+	ID               *uint64 `protobuf:"varint,1,req" json:"ID,omitempty"`
+	Host             *string `protobuf:"bytes,2,req" json:"Host,omitempty"`
+	XXX_unrecognized []byte  `json:"-"`
+}
+
+func (m *UpdateNodeCommand) Reset()         { *m = UpdateNodeCommand{} }
+func (m *UpdateNodeCommand) String() string { return proto.CompactTextString(m) }
+func (*UpdateNodeCommand) ProtoMessage()    {}
+
+func (m *UpdateNodeCommand) GetID() uint64 {
+	if m != nil && m.ID != nil {
+		return *m.ID
+	}
+	return 0
+}
+
+func (m *UpdateNodeCommand) GetHost() string {
+	if m != nil && m.Host != nil {
+		return *m.Host
+	}
+	return ""
+}
+
+var E_UpdateNodeCommand_Command = &proto.ExtensionDesc{
+	ExtendedType:  (*Command)(nil),
+	ExtensionType: (*UpdateNodeCommand)(nil),
+	Field:         119,
+	Name:          "internal.UpdateNodeCommand.command",
+	Tag:           "bytes,119,opt,name=command",
+}
+
 type Response struct {
 	OK               *bool   `protobuf:"varint,1,req" json:"OK,omitempty"`
 	Error            *string `protobuf:"bytes,2,opt" json:"Error,omitempty"`
@ -1144,7 +1222,182 @@ func (m *Response) GetIndex() uint64 {
 	return 0
 }

+type ResponseHeader struct {
+	OK               *bool   `protobuf:"varint,1,req" json:"OK,omitempty"`
+	Error            *string `protobuf:"bytes,2,opt" json:"Error,omitempty"`
+	XXX_unrecognized []byte  `json:"-"`
+}
+
+func (m *ResponseHeader) Reset()         { *m = ResponseHeader{} }
+func (m *ResponseHeader) String() string { return proto.CompactTextString(m) }
+func (*ResponseHeader) ProtoMessage()    {}
+
+func (m *ResponseHeader) GetOK() bool {
+	if m != nil && m.OK != nil {
+		return *m.OK
+	}
+	return false
+}
+
+func (m *ResponseHeader) GetError() string {
+	if m != nil && m.Error != nil {
+		return *m.Error
+	}
+	return ""
+}
+
+type ErrorResponse struct {
+	Header           *ResponseHeader `protobuf:"bytes,1,req" json:"Header,omitempty"`
+	XXX_unrecognized []byte          `json:"-"`
+}
+
+func (m *ErrorResponse) Reset()         { *m = ErrorResponse{} }
+func (m *ErrorResponse) String() string { return proto.CompactTextString(m) }
+func (*ErrorResponse) ProtoMessage()    {}
+
+func (m *ErrorResponse) GetHeader() *ResponseHeader {
+	if m != nil {
+		return m.Header
+	}
+	return nil
+}
+
+type FetchDataRequest struct {
+	Index            *uint64 `protobuf:"varint,1,req" json:"Index,omitempty"`
+	Term             *uint64 `protobuf:"varint,2,req" json:"Term,omitempty"`
+	Blocking         *bool   `protobuf:"varint,3,opt,def=0" json:"Blocking,omitempty"`
+	XXX_unrecognized []byte  `json:"-"`
+}
+
+func (m *FetchDataRequest) Reset()         { *m = FetchDataRequest{} }
+func (m *FetchDataRequest) String() string { return proto.CompactTextString(m) }
+func (*FetchDataRequest) ProtoMessage()    {}
+
+const Default_FetchDataRequest_Blocking bool = false
+
+func (m *FetchDataRequest) GetIndex() uint64 {
+	if m != nil && m.Index != nil {
+		return *m.Index
+	}
+	return 0
+}
+
+func (m *FetchDataRequest) GetTerm() uint64 {
+	if m != nil && m.Term != nil {
+		return *m.Term
+	}
+	return 0
+}
+
+func (m *FetchDataRequest) GetBlocking() bool {
+	if m != nil && m.Blocking != nil {
+		return *m.Blocking
+	}
+	return Default_FetchDataRequest_Blocking
+}
+
+type FetchDataResponse struct {
+	Header           *ResponseHeader `protobuf:"bytes,1,req" json:"Header,omitempty"`
+	Index            *uint64         `protobuf:"varint,2,req" json:"Index,omitempty"`
+	Term             *uint64         `protobuf:"varint,3,req" json:"Term,omitempty"`
+	Data             []byte          `protobuf:"bytes,4,opt" json:"Data,omitempty"`
+	XXX_unrecognized []byte          `json:"-"`
+}
+
+func (m *FetchDataResponse) Reset()         { *m = FetchDataResponse{} }
+func (m *FetchDataResponse) String() string { return proto.CompactTextString(m) }
+func (*FetchDataResponse) ProtoMessage()    {}
+
+func (m *FetchDataResponse) GetHeader() *ResponseHeader {
+	if m != nil {
+		return m.Header
+	}
+	return nil
+}
+
+func (m *FetchDataResponse) GetIndex() uint64 {
+	if m != nil && m.Index != nil {
+		return *m.Index
+	}
+	return 0
+}
+
+func (m *FetchDataResponse) GetTerm() uint64 {
+	if m != nil && m.Term != nil {
+		return *m.Term
+	}
+	return 0
+}
+
+func (m *FetchDataResponse) GetData() []byte {
+	if m != nil {
+		return m.Data
+	}
+	return nil
+}
+
+type JoinRequest struct {
+	Addr             *string `protobuf:"bytes,1,req" json:"Addr,omitempty"`
+	XXX_unrecognized []byte  `json:"-"`
+}
+
+func (m *JoinRequest) Reset()         { *m = JoinRequest{} }
+func (m *JoinRequest) String() string { return proto.CompactTextString(m) }
+func (*JoinRequest) ProtoMessage()    {}
+
+func (m *JoinRequest) GetAddr() string {
+	if m != nil && m.Addr != nil {
+		return *m.Addr
+	}
+	return ""
+}
+
+type JoinResponse struct {
+	Header *ResponseHeader `protobuf:"bytes,1,req" json:"Header,omitempty"`
+	// Indicates that this node should take part in the raft cluster.
+	EnableRaft *bool `protobuf:"varint,2,opt" json:"EnableRaft,omitempty"`
+	// The addresses of raft peers to use if joining as a raft member. If not joining
+	// as a raft member, these are the nodes running raft.
+	RaftNodes []string `protobuf:"bytes,3,rep" json:"RaftNodes,omitempty"`
+	// The node ID assigned to the requesting node.
+	NodeID           *uint64 `protobuf:"varint,4,opt" json:"NodeID,omitempty"`
+	XXX_unrecognized []byte  `json:"-"`
+}
+
+func (m *JoinResponse) Reset()         { *m = JoinResponse{} }
+func (m *JoinResponse) String() string { return proto.CompactTextString(m) }
+func (*JoinResponse) ProtoMessage()    {}
+
+func (m *JoinResponse) GetHeader() *ResponseHeader {
+	if m != nil {
+		return m.Header
+	}
+	return nil
+}
+
+func (m *JoinResponse) GetEnableRaft() bool {
+	if m != nil && m.EnableRaft != nil {
+		return *m.EnableRaft
+	}
+	return false
+}
+
+func (m *JoinResponse) GetRaftNodes() []string {
+	if m != nil {
+		return m.RaftNodes
+	}
+	return nil
+}
+
+func (m *JoinResponse) GetNodeID() uint64 {
+	if m != nil && m.NodeID != nil {
+		return *m.NodeID
+	}
+	return 0
+}
+
 func init() {
+	proto.RegisterEnum("internal.RPCType", RPCType_name, RPCType_value)
 	proto.RegisterEnum("internal.Command_Type", Command_Type_name, Command_Type_value)
 	proto.RegisterExtension(E_CreateNodeCommand_Command)
 	proto.RegisterExtension(E_DeleteNodeCommand_Command)
@ -1164,4 +1417,5 @@ func init() {
 	proto.RegisterExtension(E_SetPrivilegeCommand_Command)
 	proto.RegisterExtension(E_SetDataCommand_Command)
 	proto.RegisterExtension(E_SetAdminPrivilegeCommand_Command)
+	proto.RegisterExtension(E_UpdateNodeCommand_Command)
 }
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/internal/meta.proto
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/internal/meta.proto
@ -98,6 +98,8 @@ message Command {
 		UpdateUserCommand                = 15;
 		SetPrivilegeCommand              = 16;
 		SetDataCommand                   = 17;
+		SetAdminPrivilegeCommand         = 18;
+		UpdateNodeCommand                = 19;
    }

    required Type type = 1;
@ -250,8 +252,69 @@ message SetAdminPrivilegeCommand {
    required bool Admin = 2;
 }

+// UpdateNodeCommand is a Command extension (field 119, Command.Type value 19)
+// that carries a node ID together with a host string.
+// NOTE(review): presumably applied to change the host of an existing node;
+// confirm against the store's command handler.
+message UpdateNodeCommand {
+    extend Command {
+        optional UpdateNodeCommand command = 119;
+    }
+    required uint64 ID = 1;
+    required string Host = 2;
+}
+
 message Response {
 	required bool OK = 1;
 	optional string Error = 2;
 	optional uint64 Index = 3;
 }
+
+
+//========================================================================
+//
+// RPC - higher-level cluster communication operations
+//
+//========================================================================
+
+// RPCType identifies the kind of message carried in an RPC frame. The same
+// values are used for requests and for their responses; Error is only used
+// for responses.
+enum RPCType {
+    Error = 1;
+    FetchData = 2;
+    Join = 3;
+}
+
+// ResponseHeader is the status part shared by every RPC response: OK reports
+// success and Error carries the failure message when OK is false.
+message ResponseHeader {
+    required bool OK = 1;
+    optional string Error = 2;
+}
+
+// ErrorResponse is the generic failure reply (RPCType Error); it consists of
+// the header only.
+message ErrorResponse {
+    required ResponseHeader Header = 1;
+}
+
+// FetchDataRequest asks a node for its current meta data.
+//
+// Index is the index of the data the caller already holds; the handler only
+// returns data when its own index differs. When Blocking is true and the
+// indexes match, the handler waits for the data to change before replying.
+// NOTE(review): Term is transmitted but the handler in rpc.go does not read it.
+message FetchDataRequest {
+    required uint64 Index = 1;
+    required uint64 Term = 2;
+    optional bool Blocking = 3 [default = false];
+}
+
+// FetchDataResponse is the reply to FetchDataRequest. Index and Term describe
+// the responder's data; Data holds that data in its binary-marshaled form and
+// is left unset when the requested index already matches.
+message FetchDataResponse {
+    required ResponseHeader Header = 1;
+    required uint64 Index = 2;
+    required uint64 Term = 3;
+    optional bytes Data = 4;
+}
+
+// JoinRequest asks to join the cluster; Addr is the cluster address of the
+// node that wants to join.
+message JoinRequest {
+    required string Addr = 1;
+}
+
+// JoinResponse is the reply to JoinRequest.
+message JoinResponse {
+    required ResponseHeader Header = 1;
+
+    // Indicates that this node should take part in the raft cluster.
+    optional bool EnableRaft = 2;
+
+    // The addresses of raft peers to use if joining as a raft member. If not joining
+    // as a raft member, these are the nodes running raft.
+    repeated string RaftNodes = 3;
+
+    // The node ID assigned to the requesting node.
+    optional uint64 NodeID = 4;
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/proxy.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/proxy.go
@ -0,0 +1,62 @@
+package meta
+
+import (
+	"io"
+	"net"
+)
+
+// proxy brokers a connection from src to dst. It starts one broker goroutine
+// per direction and returns nil once both directions have shut down, or
+// returns immediately with the first error reported by either broker.
+func proxy(dst, src *net.TCPConn) error {
+	// channels to wait on the close event for each connection
+	serverClosed := make(chan struct{}, 1)
+	clientClosed := make(chan struct{}, 1)
+
+	// Each of the two brokers can report up to two errors (one from the copy,
+	// one from the close). The buffer must hold all of them: proxy stops
+	// receiving once it returns, and a broker blocked on a send would leak.
+	errors := make(chan error, 4)
+
+	go broker(dst, src, clientClosed, errors)
+	go broker(src, dst, serverClosed, errors)
+
+	// wait for one half of the proxy to exit, then trigger a shutdown of the
+	// other half by calling CloseRead(). This will break the read loop in the
+	// broker and allow us to fully close the connection cleanly without a
+	// "use of closed network connection" error.
+	var waitFor chan struct{}
+	select {
+	case <-clientClosed:
+		// the client closed first and any more packets from the server aren't
+		// useful, so we can optionally SetLinger(0) here to recycle the port
+		// faster.
+		dst.SetLinger(0)
+		dst.CloseRead()
+		waitFor = serverClosed
+	case <-serverClosed:
+		src.CloseRead()
+		waitFor = clientClosed
+	case err := <-errors:
+		// Shut down both read sides so the brokers unwind, then report the
+		// failure without waiting for them.
+		src.CloseRead()
+		dst.SetLinger(0)
+		dst.CloseRead()
+		return err
+	}
+
+	// Wait for the other connection to close.
+	<-waitFor
+	return nil
+}
+
+// broker performs the data transfer for one direction of the proxy: it copies
+// everything readable from src into dst, closes src, and then signals
+// srcClosed. A failure of the copy or of the close is reported on errors
+// before the signal is sent.
+func broker(dst, src net.Conn, srcClosed chan struct{}, errors chan error) {
+	// io.Copy is used as-is; inlining it would allow finer-grained error
+	// handling and a custom buffer size, at the cost of more code.
+	if _, copyErr := io.Copy(dst, src); copyErr != nil {
+		errors <- copyErr
+	}
+
+	if closeErr := src.Close(); closeErr != nil {
+		errors <- closeErr
+	}
+
+	srcClosed <- struct{}{}
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/rpc.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/rpc.go
@ -0,0 +1,460 @@
+package meta
+
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net"
+	"time"
+
+	"github.com/gogo/protobuf/proto"
+	"github.com/hashicorp/raft"
+	"github.com/influxdb/influxdb/meta/internal"
+)
+
+// Limits used by the RPC layer.
+//
+// MaxMessageSize (1 GiB) is the size at or above which handleRPCConn treats
+// an incoming message's declared size as invalid. leaderDialTimeout bounds
+// every outgoing dial made by the RPC layer.
+const (
+	MaxMessageSize    = 1024 * 1024 * 1024
+	leaderDialTimeout = 10 * time.Second
+)
+
+// rpc handles request/response style messaging between cluster nodes.
+//
+// logger receives error output and, when tracingEnabled is set, the trace
+// messages written by traceCluster. store is the subset of the meta store
+// that the RPC handlers and clients call into.
+type rpc struct {
+	logger         *log.Logger
+	tracingEnabled bool
+
+	store interface {
+		cachedData() *Data
+		IsLeader() bool
+		Leader() string
+		Peers() ([]string, error)
+		AddPeer(host string) error
+		CreateNode(host string) (*NodeInfo, error)
+		NodeByHost(host string) (*NodeInfo, error)
+		WaitForDataChanged() error
+	}
+}
+
+// JoinResult is the outcome of a successful join call. Its fields are copied
+// from the JoinResponse: RaftEnabled from EnableRaft, RaftNodes from the
+// reported raft peers, and NodeID from the ID assigned to the joining node.
+type JoinResult struct {
+	RaftEnabled bool
+	RaftNodes   []string
+	NodeID      uint64
+}
+
+// Reply is implemented by every RPC response message that carries a
+// ResponseHeader; the RPC layer uses it to set and inspect the OK/Error status
+// without knowing the concrete response type.
+type Reply interface {
+	GetHeader() *internal.ResponseHeader
+}
+
+// proxyLeader proxies the connection to the current raft leader. Any failure
+// to reach the leader is reported back to the client as an error response.
+func (r *rpc) proxyLeader(conn *net.TCPConn) {
+	// Read the leader once so the address that was checked is the address
+	// that gets dialed, even if leadership changes in between.
+	leader := r.store.Leader()
+	if leader == "" {
+		r.sendError(conn, "no leader")
+		return
+	}
+
+	leaderConn, err := net.DialTimeout("tcp", leader, leaderDialTimeout)
+	if err != nil {
+		r.sendError(conn, fmt.Sprintf("dial leader: %v", err))
+		return
+	}
+	defer leaderConn.Close()
+
+	// proxy relies on the half-close support of *net.TCPConn.
+	leaderTCP, ok := leaderConn.(*net.TCPConn)
+	if !ok {
+		r.sendError(conn, fmt.Sprintf("dial leader: unexpected connection type %T", leaderConn))
+		return
+	}
+
+	// Identify the stream as RPC traffic to the leader's multiplexer. Without
+	// the marker byte the leader cannot route the connection, so give up early.
+	if _, err := leaderConn.Write([]byte{MuxRPCHeader}); err != nil {
+		r.sendError(conn, fmt.Sprintf("write leader rpc header: %v", err))
+		return
+	}
+
+	if err := proxy(leaderTCP, conn); err != nil {
+		r.sendError(conn, fmt.Sprintf("leader proxy error: %v", err))
+	}
+}
+
+// handleRPCConn reads a command from the connection and executes it, then
+// writes the response back. The connection is closed before returning.
+//
+// A request is framed as an 8-byte big-endian size, followed by that many
+// bytes: an 8-byte big-endian RPC type and the protobuf-encoded message.
+func (r *rpc) handleRPCConn(conn net.Conn) {
+	defer conn.Close()
+	// RPC connections should execute on the leader.  If we are not the leader,
+	// proxy the connection to the leader so that clients can connect to any node
+	// in the cluster.
+	r.traceCluster("rpc connection from: %v", conn.RemoteAddr())
+
+	if !r.store.IsLeader() {
+		// Proxying needs a *net.TCPConn; reject anything else with an error
+		// response instead of panicking on the type assertion.
+		tcpConn, ok := conn.(*net.TCPConn)
+		if !ok {
+			r.sendError(conn, fmt.Sprintf("cannot proxy to leader: unsupported connection type %T", conn))
+			return
+		}
+		r.proxyLeader(tcpConn)
+		return
+	}
+
+	// Read and execute request.
+	typ, resp, err := func() (internal.RPCType, proto.Message, error) {
+		// Read request size.
+		var sz uint64
+		if err := binary.Read(conn, binary.BigEndian, &sz); err != nil {
+			return internal.RPCType_Error, nil, fmt.Errorf("read size: %s", err)
+		}
+
+		// The message must at least contain the 8-byte RPC type. Smaller
+		// sizes (including zero) would otherwise panic when the type is
+		// sliced out of the buffer below.
+		if sz < 8 {
+			return internal.RPCType_Error, nil, fmt.Errorf("invalid message size: %d", sz)
+		}
+
+		if sz >= MaxMessageSize {
+			return internal.RPCType_Error, nil, fmt.Errorf("max message size of %d exceeded: %d", MaxMessageSize, sz)
+		}
+
+		// Read request.
+		buf := make([]byte, sz)
+		if _, err := io.ReadFull(conn, buf); err != nil {
+			return internal.RPCType_Error, nil, fmt.Errorf("read request: %s", err)
+		}
+
+		// Determine the RPC type
+		rpcType := internal.RPCType(btou64(buf[0:8]))
+		buf = buf[8:]
+
+		r.traceCluster("recv %v request on: %v", rpcType, conn.RemoteAddr())
+		switch rpcType {
+		case internal.RPCType_FetchData:
+			var req internal.FetchDataRequest
+			if err := proto.Unmarshal(buf, &req); err != nil {
+				return internal.RPCType_Error, nil, fmt.Errorf("fetch request unmarshal: %v", err)
+			}
+			resp, err := r.handleFetchData(&req)
+			return rpcType, resp, err
+		case internal.RPCType_Join:
+			var req internal.JoinRequest
+			if err := proto.Unmarshal(buf, &req); err != nil {
+				return internal.RPCType_Error, nil, fmt.Errorf("join request unmarshal: %v", err)
+			}
+			resp, err := r.handleJoinRequest(&req)
+			return rpcType, resp, err
+		default:
+			return internal.RPCType_Error, nil, fmt.Errorf("unknown rpc type:%v", rpcType)
+		}
+	}()
+
+	// Handle unexpected RPC errors. The handler's response is discarded and
+	// replaced by a generic error response; the message is filled in below.
+	if err != nil {
+		resp = &internal.ErrorResponse{
+			Header: &internal.ResponseHeader{
+				OK: proto.Bool(false),
+			},
+		}
+		typ = internal.RPCType_Error
+	}
+
+	// Set the status header and error message
+	if reply, ok := resp.(Reply); ok {
+		reply.GetHeader().OK = proto.Bool(err == nil)
+		if err != nil {
+			reply.GetHeader().Error = proto.String(err.Error())
+		}
+	}
+
+	r.sendResponse(conn, typ, resp)
+}
+
+// sendResponse encodes resp as a protobuf, frames it with its size and RPC
+// type, and writes the frame to conn. Failures are logged, not returned.
+func (r *rpc) sendResponse(conn net.Conn, typ internal.RPCType, resp proto.Message) {
+	payload, marshalErr := proto.Marshal(resp)
+	if marshalErr != nil {
+		r.logger.Printf("unable to marshal response: %v", marshalErr)
+		return
+	}
+
+	frame := r.pack(typ, payload)
+	_, writeErr := conn.Write(frame)
+	if writeErr != nil {
+		r.logger.Printf("unable to write rpc response: %s", writeErr)
+	}
+}
+
+// sendError traces msg and sends it to conn as an ErrorResponse whose header
+// has OK set to false.
+func (r *rpc) sendError(conn net.Conn, msg string) {
+	// msg is runtime text (it often embeds another error), so it must not be
+	// used as the format string: a stray '%' would garble the trace output.
+	r.traceCluster("%s", msg)
+	resp := &internal.ErrorResponse{
+		Header: &internal.ResponseHeader{
+			OK:    proto.Bool(false),
+			Error: proto.String(msg),
+		},
+	}
+
+	r.sendResponse(conn, internal.RPCType_Error, resp)
+}
+
+// handleFetchData handles a request for the current node's meta data.
+//
+// The marshaled data is included in the response only when the store's index
+// differs from the index in the request. If the indexes match, a non-blocking
+// request gets a response without data, while a blocking request waits for
+// the store to signal a change and then re-checks.
+func (r *rpc) handleFetchData(req *internal.FetchDataRequest) (*internal.FetchDataResponse, error) {
+	current := r.store.cachedData()
+
+	// Block for as long as the caller is up to date and asked to wait.
+	for current.Index == req.GetIndex() && req.GetBlocking() {
+		if waitErr := r.store.WaitForDataChanged(); waitErr != nil {
+			return nil, waitErr
+		}
+		current = r.store.cachedData()
+	}
+
+	// Only ship the data when the caller's copy is out of date.
+	var payload []byte
+	if current.Index != req.GetIndex() {
+		encoded, marshalErr := current.MarshalBinary()
+		if marshalErr != nil {
+			return nil, marshalErr
+		}
+		payload = encoded
+	}
+
+	header := &internal.ResponseHeader{OK: proto.Bool(true)}
+	return &internal.FetchDataResponse{
+		Header: header,
+		Index:  proto.Uint64(current.Index),
+		Term:   proto.Uint64(current.Term),
+		Data:   payload,
+	}, nil
+}
+
+// handleJoinRequest handles a request to join the cluster.
+//
+// The requesting address is registered as a node (or looked up if it already
+// exists). While the cluster has fewer than MaxRaftNodes raft peers, the
+// address is also added as a raft peer. The response reports the node's ID,
+// the current raft peers, and whether the requester is one of them.
+func (r *rpc) handleJoinRequest(req *internal.JoinRequest) (*internal.JoinResponse, error) {
+	// Use the nil-safe getter: dereferencing req.Addr panics when it is unset.
+	addr := req.GetAddr()
+	if addr == "" {
+		return nil, errors.New("join request: missing address")
+	}
+	r.traceCluster("join request from: %v", addr)
+
+	// attempt to create the node
+	node, err := r.store.CreateNode(addr)
+	if err == ErrNodeExists {
+		// if it exists, use the existing node
+		node, err = r.store.NodeByHost(addr)
+		if err != nil {
+			return nil, err
+		}
+		if node == nil {
+			return nil, fmt.Errorf("node not found: %v", addr)
+		}
+		r.logger.Printf("existing node re-joined: id=%v addr=%v", node.ID, node.Host)
+	} else if err != nil {
+		return nil, fmt.Errorf("create node: %v", err)
+	}
+
+	var nodeID uint64
+	if node != nil {
+		nodeID = node.ID
+	}
+
+	peers, err := r.store.Peers()
+	if err != nil {
+		return nil, fmt.Errorf("list peers: %v", err)
+	}
+
+	// If we have fewer than MaxRaftNodes raft peers, add the node as a raft
+	// peer if it is not already one.
+	if len(peers) < MaxRaftNodes && !raft.PeerContained(peers, addr) {
+		r.logger.Printf("adding new raft peer: nodeId=%v addr=%v", nodeID, addr)
+		if err := r.store.AddPeer(addr); err != nil {
+			return nil, fmt.Errorf("add peer: %v", err)
+		}
+	}
+
+	// get the current raft peers (re-read, the list may just have changed)
+	peers, err = r.store.Peers()
+	if err != nil {
+		return nil, fmt.Errorf("list peers: %v", err)
+	}
+
+	return &internal.JoinResponse{
+		Header: &internal.ResponseHeader{
+			OK: proto.Bool(true),
+		},
+		EnableRaft: proto.Bool(raft.PeerContained(peers, addr)),
+		RaftNodes:  peers,
+		NodeID:     proto.Uint64(nodeID),
+	}, nil
+}
+
+// pack builds the wire frame for an RPC message: an 8-byte big-endian size
+// (payload length plus the 8 bytes of the type), the 8-byte big-endian RPC
+// type, and finally the payload itself.
+func (r *rpc) pack(typ internal.RPCType, b []byte) []byte {
+	frame := make([]byte, 16+len(b))
+	binary.BigEndian.PutUint64(frame[0:8], uint64(len(b))+8)
+	binary.BigEndian.PutUint64(frame[8:16], uint64(typ))
+	copy(frame[16:], b)
+	return frame
+}
+
+// fetchMetaData returns the latest copy of the meta store data from the current
+// leader.
+//
+// The index and term of the locally cached data (zero when nothing is cached)
+// are sent along with the request. It returns (nil, nil) when the leader
+// replies without data, i.e. the local copy is already current. When blocking
+// is true the leader is asked to hold the request until its data changes.
+func (r *rpc) fetchMetaData(blocking bool) (*Data, error) {
+	assert(r.store != nil, "store is nil")
+
+	// Retrieve the current known leader.
+	leader := r.store.Leader()
+	if leader == "" {
+		return nil, errors.New("no leader")
+	}
+
+	var index, term uint64
+	data := r.store.cachedData()
+	if data != nil {
+		index = data.Index
+		// Send the cached term, not the index a second time.
+		term = data.Term
+	}
+	resp, err := r.call(leader, &internal.FetchDataRequest{
+		Index:    proto.Uint64(index),
+		Term:     proto.Uint64(term),
+		Blocking: proto.Bool(blocking),
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	switch t := resp.(type) {
+	case *internal.FetchDataResponse:
+		// If data is nil, then the term and index we sent matches the leader
+		if t.GetData() == nil {
+			return nil, nil
+		}
+		ms := &Data{}
+		if err := ms.UnmarshalBinary(t.GetData()); err != nil {
+			return nil, fmt.Errorf("rpc unmarshal metadata: %v", err)
+		}
+		return ms, nil
+	case *internal.ErrorResponse:
+		return nil, fmt.Errorf("rpc failed: %s", t.GetHeader().GetError())
+	default:
+		return nil, fmt.Errorf("rpc failed: unknown response type: %v", t.String())
+	}
+}
+
+// join sends a join request to the node at remoteAddr, announcing localAddr
+// as this node's cluster address, and converts the reply into a JoinResult.
+func (r *rpc) join(localAddr, remoteAddr string) (*JoinResult, error) {
+	resp, err := r.call(remoteAddr, &internal.JoinRequest{Addr: proto.String(localAddr)})
+	if err != nil {
+		return nil, err
+	}
+
+	if failure, isErr := resp.(*internal.ErrorResponse); isErr {
+		return nil, fmt.Errorf("rpc failed: %s", failure.GetHeader().GetError())
+	}
+
+	joined, isJoin := resp.(*internal.JoinResponse)
+	if !isJoin {
+		return nil, fmt.Errorf("rpc failed: unknown response type: %v", resp.String())
+	}
+
+	result := &JoinResult{
+		RaftEnabled: joined.GetEnableRaft(),
+		RaftNodes:   joined.GetRaftNodes(),
+		NodeID:      joined.GetNodeID(),
+	}
+	return result, nil
+}
+
+// call dials dest, sends req as a framed RPC message, reads the complete
+// reply until the remote side closes the connection, and decodes it.
+//
+// A reply whose header reports OK == false is turned into an error, as is
+// any framing or decoding problem.
+func (r *rpc) call(dest string, req proto.Message) (proto.Message, error) {
+	// Map the request onto its wire type.
+	var rpcType internal.RPCType
+	switch req.(type) {
+	case *internal.FetchDataRequest:
+		rpcType = internal.RPCType_FetchData
+	case *internal.JoinRequest:
+		rpcType = internal.RPCType_Join
+	default:
+		return nil, fmt.Errorf("unknown rpc request type: %v", req)
+	}
+
+	// Connect to the remote node.
+	conn, dialErr := net.DialTimeout("tcp", dest, leaderDialTimeout)
+	if dialErr != nil {
+		return nil, fmt.Errorf("rpc dial: %v", dialErr)
+	}
+	defer conn.Close()
+
+	// The marker byte tells the remote multiplexer this is RPC traffic.
+	if _, markerErr := conn.Write([]byte{MuxRPCHeader}); markerErr != nil {
+		return nil, markerErr
+	}
+
+	payload, marshalErr := proto.Marshal(req)
+	if marshalErr != nil {
+		return nil, fmt.Errorf("rpc marshal: %v", marshalErr)
+	}
+
+	// Send size, type and payload in a single frame.
+	if _, writeErr := conn.Write(r.pack(rpcType, payload)); writeErr != nil {
+		return nil, fmt.Errorf("write %v rpc: %s", rpcType, writeErr)
+	}
+
+	raw, readErr := ioutil.ReadAll(conn)
+	if readErr != nil {
+		return nil, fmt.Errorf("read %v rpc: %v", rpcType, readErr)
+	}
+
+	// A reply always starts with an 8-byte size and an 8-byte type.
+	const headerLen = 16
+	if len(raw) < headerLen {
+		r.traceCluster("recv: %v", string(raw))
+		return nil, fmt.Errorf("rpc %v failed: short read: got %v, exp %v", rpcType, len(raw), headerLen)
+	}
+
+	// The declared size covers everything after the size field itself.
+	declared := btou64(raw[0:8])
+	if framed := raw[8:]; len(framed) != int(declared) {
+		r.traceCluster("recv: %v", string(raw))
+		return nil, fmt.Errorf("rpc %v failed: short read: got %v, exp %v", rpcType, len(framed), declared)
+	}
+
+	// The reply type may differ from the request type: a general error
+	// response can come back for any request.
+	rpcType = internal.RPCType(btou64(raw[8:16]))
+	body := raw[headerLen:]
+
+	var resp proto.Message
+	switch rpcType {
+	case internal.RPCType_Error:
+		resp = &internal.ErrorResponse{}
+	case internal.RPCType_FetchData:
+		resp = &internal.FetchDataResponse{}
+	case internal.RPCType_Join:
+		resp = &internal.JoinResponse{}
+	default:
+		return nil, fmt.Errorf("unknown rpc response type: %v", rpcType)
+	}
+
+	if decodeErr := proto.Unmarshal(body, resp); decodeErr != nil {
+		return nil, fmt.Errorf("rpc unmarshal: %v", decodeErr)
+	}
+
+	reply, hasHeader := resp.(Reply)
+	if hasHeader && !reply.GetHeader().GetOK() {
+		return nil, fmt.Errorf("rpc %v failed: %s", rpcType, reply.GetHeader().GetError())
+	}
+
+	return resp, nil
+}
+
+// traceCluster logs a formatted message with an "rpc: " prefix, but only
+// when tracing is enabled on r.
+func (r *rpc) traceCluster(msg string, args ...interface{}) {
+	if !r.tracingEnabled {
+		return
+	}
+	r.logger.Printf("rpc: "+msg, args...)
+}
+
+// u64tob encodes v as an 8-byte big-endian slice.
+func u64tob(v uint64) []byte {
+	var raw [8]byte
+	binary.BigEndian.PutUint64(raw[:], v)
+	return raw[:]
+}
+
+// btou64 decodes the first 8 bytes of b as a big-endian uint64. It panics if
+// b holds fewer than 8 bytes.
+func btou64(b []byte) uint64 {
+	order := binary.BigEndian
+	return order.Uint64(b)
+}
+
+// contains reports whether e is one of the elements of s.
+func contains(s []string, e string) bool {
+	for i := 0; i < len(s); i++ {
+		if s[i] == e {
+			return true
+		}
+	}
+	return false
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/rpc_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/rpc_test.go
@ -0,0 +1,242 @@
+package meta
+
+import (
+	"net"
+	"sync"
+	"testing"
+)
+
+// TestRPCFetchData verifies that a client with no cached meta data receives
+// the server's data.
+func TestRPCFetchData(t *testing.T) {
+	// Server side: a store whose data is at index 99.
+	server := &rpc{store: &fakeStore{md: &Data{Index: 99}}}
+
+	srv := newTestServer(t, server)
+	defer srv.Close()
+	go srv.Serve()
+
+	// Wait for the RPC server to be ready
+	<-srv.Ready
+
+	// Client side: no cached meta.Data, leader pointing at the test server.
+	client := &rpc{store: &fakeStore{leader: srv.Listener.Addr().String()}}
+
+	got, err := client.fetchMetaData(false)
+	switch {
+	case err != nil:
+		t.Fatalf("failed to fetchMetaData: %v", err)
+	case got == nil:
+		t.Fatalf("meta-data is nil")
+	case got.Index != uint64(99):
+		t.Fatalf("meta-data mismatch. got %v, exp %v", got.Index, uint64(99))
+	}
+}
+
+// TestRPCFetchDataMatchesLeader verifies that no meta data is returned when
+// the client's cached index already equals the server's index.
+func TestRPCFetchDataMatchesLeader(t *testing.T) {
+	// Server side: a store whose data is at index 99.
+	server := &rpc{store: &fakeStore{md: &Data{Index: 99}}}
+
+	srv := newTestServer(t, server)
+	defer srv.Close()
+	go srv.Serve()
+
+	// Wait for the RPC server to be ready
+	<-srv.Ready
+
+	// Client side: cached data at the same index as the server.
+	clientStore := &fakeStore{
+		leader: srv.Listener.Addr().String(),
+		md:     &Data{Index: 99},
+	}
+	client := &rpc{store: clientStore}
+
+	got, err := client.fetchMetaData(false)
+	switch {
+	case err != nil:
+		t.Fatalf("failed to fetchMetaData: %v", err)
+	case got != nil:
+		t.Fatalf("meta-data is not nil")
+	}
+}
+
+// TestRPCFetchDataMatchesBlocking verifies that a blocking fetch with a
+// matching index returns the server's data once the server's index changes.
+func TestRPCFetchDataMatchesBlocking(t *testing.T) {
+	fs := &fakeStore{
+		md:        &Data{Index: 99},
+		blockChan: make(chan struct{}),
+	}
+	serverRPC := &rpc{
+		store: fs,
+	}
+
+	srv := newTestServer(t, serverRPC)
+	defer srv.Close()
+	go srv.Serve()
+
+	// Wait for the RPC server to be ready
+	<-srv.Ready
+
+	// create a new RPC with a matching index as the server
+	clientRPC := &rpc{
+		store: &fakeStore{
+			leader: srv.Listener.Addr().String(),
+			md:     &Data{Index: 99},
+		},
+	}
+
+	// Kick off the fetching block
+	var wg sync.WaitGroup
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		// Fatalf/FailNow must only be called from the goroutine running the
+		// test, so failures here are reported with Errorf followed by return.
+
+		// fetch the servers meta-data
+		md, err := clientRPC.fetchMetaData(true)
+		if err != nil {
+			t.Errorf("failed to fetchMetaData: %v", err)
+			return
+		}
+
+		if md == nil {
+			t.Errorf("meta-data is nil")
+			return
+		}
+
+		if exp := uint64(100); md.Index != exp {
+			t.Errorf("meta-data mismatch. got %v, exp %v", md.Index, exp)
+		}
+	}()
+
+	// Simulate the remote index changing and unblocking
+	fs.mu.Lock()
+	fs.md.Index = 100
+	fs.mu.Unlock()
+	close(fs.blockChan)
+	wg.Wait()
+}
+
+// TestRPCJoin verifies that joining through the RPC layer reports the raft
+// membership, the raft peers and the newly assigned node ID.
+func TestRPCJoin(t *testing.T) {
+	fs := &fakeStore{
+		leader:    "1.2.3.4:1234",
+		md:        &Data{Index: 99},
+		newNodeID: uint64(100),
+		blockChan: make(chan struct{}),
+	}
+	serverRPC := &rpc{
+		store: fs,
+	}
+
+	srv := newTestServer(t, serverRPC)
+	defer srv.Close()
+	go srv.Serve()
+
+	// Wait for the RPC server to be ready
+	<-srv.Ready
+
+	// create a new RPC with a matching index as the server
+	clientRPC := &rpc{
+		store: &fakeStore{
+			leader: srv.Listener.Addr().String(),
+			md:     &Data{Index: 99},
+		},
+	}
+
+	res, err := clientRPC.join("1.2.3.4:1234", srv.Listener.Addr().String())
+	if err != nil {
+		t.Fatalf("failed to join: %v", err)
+	}
+
+	// Compare against exp so the check and the failure message cannot drift.
+	if exp := true; res.RaftEnabled != exp {
+		t.Fatalf("raft enabled mismatch: got %v, exp %v", res.RaftEnabled, exp)
+	}
+
+	if exp := 1; len(res.RaftNodes) != exp {
+		t.Fatalf("raft peer mismatch: got %v, exp %v", len(res.RaftNodes), exp)
+	}
+
+	if exp := "1.2.3.4:1234"; res.RaftNodes[0] != exp {
+		t.Fatalf("raft peer mismatch: got %v, exp %v", res.RaftNodes[0], exp)
+	}
+
+	if exp := uint64(100); res.NodeID != exp {
+		t.Fatalf("node id mismatch. got %v, exp %v", res.NodeID, exp)
+	}
+}
+
+// fakeStore is an in-memory stand-in for the meta store used by the RPC tests.
+//
+// mu guards md. leader is returned by Leader and as the only entry of Peers.
+// newNodeID is the ID given to nodes made by CreateNode. blockChan is what
+// WaitForDataChanged blocks on until the test closes it.
+type fakeStore struct {
+	mu        sync.RWMutex
+	leader    string
+	newNodeID uint64
+	md        *Data
+	blockChan chan struct{}
+}
+
+// testServer accepts a single connection on a local listener and hands it to
+// the rpc under test.
+type testServer struct {
+	Listener net.Listener
+	Ready    chan struct{} // closed by Serve right before it starts accepting
+	rpc      *rpc
+	t        *testing.T
+}
+
+// newTestServer returns a testServer listening on an ephemeral loopback port.
+func newTestServer(t *testing.T, rpc *rpc) *testServer {
+	ln, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatalf("failed to listen: %v", err)
+	}
+	return &testServer{
+		Listener: ln,
+		Ready:    make(chan struct{}),
+		rpc:      rpc,
+		// Serve reports failures through t; leaving it nil would turn any
+		// failure into a nil pointer dereference.
+		t: t,
+	}
+}
+
+// Close shuts down the listener.
+func (s *testServer) Close() {
+	s.Listener.Close()
+}
+
+// Serve signals readiness, accepts one connection, strips the one-byte mux
+// header and lets the rpc handle the rest. It is meant to run in its own
+// goroutine, so failures are reported with Errorf: Fatalf may only be called
+// from the goroutine running the test.
+func (s *testServer) Serve() {
+	close(s.Ready)
+	conn, err := s.Listener.Accept()
+	if err != nil {
+		s.t.Errorf("failed to accept: %v", err)
+		return
+	}
+
+	// Demux...
+	b := make([]byte, 1)
+	if _, err := conn.Read(b); err != nil {
+		// handleRPCConn would have closed the connection; do it here instead.
+		conn.Close()
+		s.t.Errorf("failed to demux: %v", err)
+		return
+	}
+	s.rpc.handleRPCConn(conn)
+}
+
+// cachedData returns the fake's meta data pointer, read under the read lock.
+func (f *fakeStore) cachedData() *Data {
+	f.mu.RLock()
+	defer f.mu.RUnlock()
+	return f.md
+}
+
+// The methods below are fixed stubs: the fake always claims to be the leader,
+// lists its configured leader as the only peer, accepts every AddPeer, gives
+// each created node the configured newNodeID, and finds no node by host.
+func (f *fakeStore) IsLeader() bool            { return true }
+func (f *fakeStore) Leader() string            { return f.leader }
+func (f *fakeStore) Peers() ([]string, error)  { return []string{f.leader}, nil }
+func (f *fakeStore) AddPeer(host string) error { return nil }
+func (f *fakeStore) CreateNode(host string) (*NodeInfo, error) {
+	return &NodeInfo{ID: f.newNodeID, Host: host}, nil
+}
+func (f *fakeStore) NodeByHost(host string) (*NodeInfo, error) { return nil, nil }
+
+// WaitForDataChanged blocks until a receive on blockChan succeeds, which the
+// tests trigger by closing the channel.
+func (f *fakeStore) WaitForDataChanged() error {
+	<-f.blockChan
+	return nil
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/state.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/state.go
@ -0,0 +1,489 @@
+package meta
+
+import (
+	"bytes"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"path/filepath"
+	"sync"
+	"time"
+
+	"github.com/hashicorp/raft"
+	"github.com/hashicorp/raft-boltdb"
+)
+
+// raftState abstracts the interaction of the raft consensus layer
+// across local or remote nodes.  It is a form of the state design pattern and allows
+// the meta.Store to change its behavior with the raft layer at runtime.
+//
+// Two implementations exist in this file: localRaft, which runs a raft node
+// in-process, and remoteRaft, which relies on a remote raft cluster.
+type raftState interface {
+	// open starts the state; close stops it.
+	open() error
+	// remove clears any persistent state owned by the implementation.
+	remove() error
+	initialize() error
+	// leader returns the address of the node to treat as leader, or "".
+	leader() string
+	isLeader() bool
+	sync(index uint64, timeout time.Duration) error
+	setPeers(addrs []string) error
+	addPeer(addr string) error
+	peers() ([]string, error)
+	// invalidate refreshes the store's cached metadata where applicable.
+	invalidate() error
+	close() error
+	lastIndex() uint64
+	// apply submits a serialized command; snapshot requests a snapshot.
+	apply(b []byte) error
+	snapshot() error
+}
+
+// localRaft is a consensus strategy that uses a local raft implementation for
+// consensus operations.
+//
+// The raft-related fields are populated by open and released by close.
+type localRaft struct {
+	// wg tracks the logLeaderChanges goroutine; closing signals it to stop.
+	wg        sync.WaitGroup
+	closing   chan struct{}
+	store     *Store
+	raft      *raft.Raft
+	transport *raft.NetworkTransport
+	peerStore raft.PeerStore
+	raftStore *raftboltdb.BoltStore
+	raftLayer *raftLayer
+}
+
+// remove deletes the raft log database, the peers file and the snapshots
+// directory from the store's path, in that order, stopping at the first
+// failure.
+func (r *localRaft) remove() error {
+	for _, name := range []string{"raft.db", "peers.json", "snapshots"} {
+		if err := os.RemoveAll(filepath.Join(r.store.path, name)); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// updateMetaData replaces the store's cached metadata with ms when ms
+// carries a higher index than the data currently held. A nil ms is ignored.
+func (r *localRaft) updateMetaData(ms *Data) {
+	if ms == nil {
+		return
+	}
+
+	// Compare and swap under one write lock. Checking under RLock and
+	// assigning under a later Lock would let a concurrent update slip in
+	// between, so an older snapshot could overwrite a newer one.
+	r.store.mu.Lock()
+	updated := ms.Index > r.store.data.Index
+	if updated {
+		r.store.data = ms
+	}
+	r.store.mu.Unlock()
+
+	if updated {
+		r.store.Logger.Printf("Updating metastore to term=%v index=%v", ms.Term, ms.Index)
+	}
+}
+
+// invalidate refreshes the cached metadata from the cluster via rpc. It is a
+// no-op when this store is the leader.
+func (r *localRaft) invalidate() error {
+	if r.store.IsLeader() {
+		return nil
+	}
+
+	ms, err := r.store.rpc.fetchMetaData(false)
+	if err != nil {
+		return err
+	}
+
+	r.updateMetaData(ms)
+	return nil
+}
+
+// open configures and starts the local raft node: it builds the raft
+// config from the store's timeouts, creates the network transport, peer
+// store, bolt-backed log store and file snapshot store under the store's
+// path, starts raft with the store as its FSM, and launches the
+// leader-change logging goroutine.
+//
+// It returns an error if the peer list cannot be read or written, if this
+// node's address is missing from a non-empty peer list, or if any of the
+// stores or raft itself cannot be created.
+func (r *localRaft) open() error {
+	r.closing = make(chan struct{})
+
+	s := r.store
+	// Setup raft configuration.
+	config := raft.DefaultConfig()
+	config.LogOutput = ioutil.Discard
+
+	// Raft's own logging is only routed to the store logger when cluster
+	// tracing is enabled; otherwise it goes to the discarded LogOutput.
+	if s.clusterTracingEnabled {
+		config.Logger = s.Logger
+	}
+	config.HeartbeatTimeout = s.HeartbeatTimeout
+	config.ElectionTimeout = s.ElectionTimeout
+	config.LeaderLeaseTimeout = s.LeaderLeaseTimeout
+	config.CommitTimeout = s.CommitTimeout
+
+	// If no peers are set in the config or there is one and we are it, then start as a single server.
+	if len(s.peers) <= 1 {
+		config.EnableSingleNode = true
+		// Ensure we can always become the leader
+		config.DisableBootstrapAfterElect = false
+		// Don't shutdown raft automatically if we renamed our hostname back to a previous name
+		config.ShutdownOnRemove = false
+	}
+
+	// Build raft layer to multiplex listener.
+	r.raftLayer = newRaftLayer(s.RaftListener, s.RemoteAddr)
+
+	// Create a transport layer
+	r.transport = raft.NewNetworkTransport(r.raftLayer, 3, 10*time.Second, config.LogOutput)
+
+	// Create peer storage.
+	r.peerStore = raft.NewJSONPeers(s.path, r.transport)
+
+	peers, err := r.peerStore.Peers()
+	if err != nil {
+		return err
+	}
+
+	// For single-node clusters, we can update the raft peers before we start the cluster if the hostname
+	// has changed.
+	if config.EnableSingleNode {
+		if err := r.peerStore.SetPeers([]string{s.RemoteAddr.String()}); err != nil {
+			return err
+		}
+		peers = []string{s.RemoteAddr.String()}
+	}
+
+	// If we have multiple nodes in the cluster, make sure our address is in the raft peers or
+	// we won't be able to boot into the cluster because the other peers will reject our new hostname.  This
+	// is difficult to resolve automatically because we need to have all the raft peers agree on the current members
+	// of the cluster before we can change them.
+	if len(peers) > 0 && !raft.PeerContained(peers, s.RemoteAddr.String()) {
+		s.Logger.Printf("%v is not in the list of raft peers. Please update %v/peers.json on all raft nodes to have the same contents.", s.RemoteAddr.String(), s.Path())
+		return fmt.Errorf("peers out of sync: %v not in %v", s.RemoteAddr.String(), peers)
+	}
+
+	// Create the log store and stable store.
+	store, err := raftboltdb.NewBoltStore(filepath.Join(s.path, "raft.db"))
+	if err != nil {
+		return fmt.Errorf("new bolt store: %s", err)
+	}
+	r.raftStore = store
+
+	// Create the snapshot store.
+	snapshots, err := raft.NewFileSnapshotStore(s.path, raftSnapshotsRetained, os.Stderr)
+	if err != nil {
+		return fmt.Errorf("file snapshot store: %s", err)
+	}
+
+	// Create raft log.
+	ra, err := raft.NewRaft(config, (*storeFSM)(s), store, store, snapshots, r.peerStore, r.transport)
+	if err != nil {
+		return fmt.Errorf("new raft: %s", err)
+	}
+	r.raft = ra
+
+	// The logging goroutine is tracked by wg so close can wait for it.
+	r.wg.Add(1)
+	go r.logLeaderChanges()
+
+	return nil
+}
+
+// logLeaderChanges logs the raft node's state once at start and again,
+// together with the current peer list, every time raft's leader channel
+// fires. It runs until r.closing is closed and marks r.wg done on exit.
+func (r *localRaft) logLeaderChanges() {
+	defer r.wg.Done()
+	// Logs our current state (Node at 1.2.3.4:8088 [Follower])
+	// The state string is passed as an argument rather than as the format,
+	// so a '%' in it cannot be misread as a formatting verb.
+	r.store.Logger.Printf("%s", r.raft.String())
+	for {
+		select {
+		case <-r.closing:
+			return
+		case <-r.raft.LeaderCh():
+			peers, err := r.peers()
+			if err != nil {
+				r.store.Logger.Printf("failed to lookup peers: %v", err)
+			}
+			r.store.Logger.Printf("%v. peers=%v", r.raft.String(), peers)
+		}
+	}
+}
+
+// close stops the leader-logging goroutine and then releases, in order, the
+// transport, the raft layer, raft itself and the bolt store, setting each
+// field to nil once released.
+//
+// If raft shutdown fails, that error is returned and the bolt store is left
+// open.
+func (r *localRaft) close() error {
+	// Signal logLeaderChanges to exit and wait for it.
+	close(r.closing)
+	r.wg.Wait()
+
+	if r.transport != nil {
+		r.transport.Close()
+		r.transport = nil
+	}
+
+	if r.raftLayer != nil {
+		r.raftLayer.Close()
+		r.raftLayer = nil
+	}
+
+	// Shutdown raft.
+	if r.raft != nil {
+		if err := r.raft.Shutdown().Error(); err != nil {
+			return err
+		}
+		r.raft = nil
+	}
+
+	if r.raftStore != nil {
+		r.raftStore.Close()
+		r.raftStore = nil
+	}
+
+	return nil
+}
+
+// initialize bootstraps the raft peer set from the store's configured peers
+// when the raft log is empty. If the log already has entries it does
+// nothing.
+func (r *localRaft) initialize() error {
+	s := r.store
+	// If we have committed entries then the store is already in the cluster.
+	if index, err := r.raftStore.LastIndex(); err != nil {
+		return fmt.Errorf("last index: %s", err)
+	} else if index > 0 {
+		return nil
+	}
+
+	// Force set peers.
+	if err := r.setPeers(s.peers); err != nil {
+		return fmt.Errorf("set raft peers: %s", err)
+	}
+
+	return nil
+}
+
+// apply applies a serialized command to the raft log.
+//
+// It returns the raft future's error if the apply itself failed. If the
+// response is an error, it is returned after being passed through
+// lookupError. Any other non-nil response trips the assertion.
+func (r *localRaft) apply(b []byte) error {
+	// Apply to raft log.
+	f := r.raft.Apply(b, 0)
+	if err := f.Error(); err != nil {
+		return err
+	}
+
+	// Return response if it's an error.
+	// No other non-nil objects should be returned.
+	resp := f.Response()
+	if err, ok := resp.(error); ok {
+		return lookupError(err)
+	}
+	assert(resp == nil, "unexpected response: %#v", resp)
+
+	return nil
+}
+
+// lastIndex returns the last index reported by the local raft node.
+func (r *localRaft) lastIndex() uint64 {
+	return r.raft.LastIndex()
+}
+
+// sync blocks until the store's cached metadata index is at least index,
+// polling every 100ms. It returns a "timeout" error if timeout elapses
+// first. The first check happens after the first tick, not immediately.
+func (r *localRaft) sync(index uint64, timeout time.Duration) error {
+	ticker := time.NewTicker(100 * time.Millisecond)
+	defer ticker.Stop()
+
+	timer := time.NewTimer(timeout)
+	defer timer.Stop()
+
+	for {
+		// Wait for next tick or timeout.
+		select {
+		case <-ticker.C:
+		case <-timer.C:
+			return errors.New("timeout")
+		}
+
+		// Compare index against current metadata.
+		r.store.mu.Lock()
+		ok := (r.store.data.Index >= index)
+		r.store.mu.Unlock()
+
+		// Exit if we are at least at the given index.
+		if ok {
+			return nil
+		}
+	}
+}
+
+// snapshot asks the local raft node for a snapshot and waits for the
+// resulting future, returning its error.
+func (r *localRaft) snapshot() error {
+	return r.raft.Snapshot().Error()
+}
+
+// addPeer adds addr to the list of peers in the cluster.
+//
+// The raft cluster is capped at MaxRaftNodes members: once the peer store
+// already holds that many peers, the request is silently ignored and nil is
+// returned. Otherwise the result of raft's AddPeer future is returned.
+func (r *localRaft) addPeer(addr string) error {
+	peers, err := r.peerStore.Peers()
+	if err != nil {
+		return err
+	}
+
+	// Use the package-level limit (the same one Store.Open enforces) rather
+	// than a second hard-coded copy of it.
+	if len(peers) >= MaxRaftNodes {
+		return nil
+	}
+
+	return r.raft.AddPeer(addr).Error()
+}
+
+// setPeers sets a list of peers in the cluster by submitting addrs to the
+// local raft node and waiting for the resulting future.
+func (r *localRaft) setPeers(addrs []string) error {
+	return r.raft.SetPeers(addrs).Error()
+}
+
+// peers returns the peer list held by the raft peer store.
+func (r *localRaft) peers() ([]string, error) {
+	return r.peerStore.Peers()
+}
+
+// leader returns the leader address reported by the local raft node, or an
+// empty string when raft has not been opened.
+func (r *localRaft) leader() string {
+	if r.raft == nil {
+		return ""
+	}
+
+	return r.raft.Leader()
+}
+
+// isLeader reports whether the local raft node is currently the leader. It
+// returns false when raft has not been opened.
+//
+// The store's mutex is deliberately not taken here. Store.IsLeader already
+// holds store.mu for reading when it calls this method, and sync.RWMutex
+// forbids recursive read locking: a writer queued between the two RLock
+// calls would block the second one forever.
+func (r *localRaft) isLeader() bool {
+	if r.raft == nil {
+		return false
+	}
+	return r.raft.State() == raft.Leader
+}
+
+// remoteRaft is a consensus strategy that uses a remote raft cluster for
+// consensus operations.
+//
+// It runs no raft node of its own: apply, addPeer and snapshot return
+// errors, and metadata is obtained through the store's rpc client.
+type remoteRaft struct {
+	store *Store
+}
+
+// remove is a no-op for remote raft; it deletes nothing and returns nil.
+func (r *remoteRaft) remove() error {
+	return nil
+}
+
+// updateMetaData replaces the store's cached metadata with ms when ms
+// carries a higher index than the data currently held. A nil ms is ignored.
+func (r *remoteRaft) updateMetaData(ms *Data) {
+	if ms == nil {
+		return
+	}
+
+	// Compare and swap under one write lock. Checking under RLock and
+	// assigning under a later Lock would let a concurrent update slip in
+	// between, so an older snapshot could overwrite a newer one.
+	r.store.mu.Lock()
+	updated := ms.Index > r.store.data.Index
+	if updated {
+		r.store.data = ms
+	}
+	r.store.mu.Unlock()
+
+	if updated {
+		r.store.Logger.Printf("Updating metastore to term=%v index=%v", ms.Term, ms.Index)
+	}
+}
+
+// invalidate fetches metadata through the store's rpc client and installs
+// it if it is newer than the cached copy.
+func (r *remoteRaft) invalidate() error {
+	ms, err := r.store.rpc.fetchMetaData(false)
+	if err != nil {
+		return err
+	}
+
+	r.updateMetaData(ms)
+	return nil
+}
+
+// setPeers persists addrs as a JSON array in peers.json under the store's
+// path. Unlike the local variant it does not talk to raft; it only writes
+// the file.
+func (r *remoteRaft) setPeers(addrs []string) error {
+	// Convert to JSON
+	var buf bytes.Buffer
+	enc := json.NewEncoder(&buf)
+	if err := enc.Encode(addrs); err != nil {
+		return err
+	}
+
+	// Write out as JSON
+	return ioutil.WriteFile(filepath.Join(r.store.path, "peers.json"), buf.Bytes(), 0755)
+}
+
+// addPeer always fails: a node in remote raft state cannot change raft
+// membership.
+func (r *remoteRaft) addPeer(addr string) error {
+	return errors.New("cannot add peer using remote raft")
+}
+
+// peers returns the peer list read from peers.json under the store's path.
+func (r *remoteRaft) peers() ([]string, error) {
+	return readPeersJSON(filepath.Join(r.store.path, "peers.json"))
+}
+
+// open writes the store's peers to peers.json and starts a background
+// goroutine that repeatedly fetches metadata over rpc and installs it.
+//
+// The goroutine exits once store.closing is closed (checked before each
+// fetch). After a failed fetch it logs the error and sleeps one second
+// before retrying. No WaitGroup is incremented for it in this function.
+func (r *remoteRaft) open() error {
+	if err := r.setPeers(r.store.peers); err != nil {
+		return err
+	}
+
+	go func() {
+		for {
+			// Non-blocking check for shutdown.
+			select {
+			case <-r.store.closing:
+				return
+			default:
+			}
+
+			// NOTE(review): the `true` argument presumably makes this a
+			// blocking fetch that waits for a change -- confirm in rpc.
+			ms, err := r.store.rpc.fetchMetaData(true)
+			if err != nil {
+				r.store.Logger.Printf("fetch metastore: %v", err)
+				time.Sleep(time.Second)
+				continue
+			}
+			r.updateMetaData(ms)
+		}
+	}()
+	return nil
+}
+
+// close is a no-op for remote raft and always returns nil.
+func (r *remoteRaft) close() error {
+	return nil
+}
+
+// apply always fails: a node in remote raft state has no local raft log to
+// append to.
+func (r *remoteRaft) apply(b []byte) error {
+	return errors.New("cannot apply log while in remote raft state")
+}
+
+// initialize is a no-op for remote raft and always returns nil.
+func (r *remoteRaft) initialize() error {
+	return nil
+}
+
+// leader returns a randomly chosen entry from the store's peer list, or an
+// empty string when the list is empty. The returned peer is not checked to
+// be the actual raft leader.
+func (r *remoteRaft) leader() string {
+	if len(r.store.peers) == 0 {
+		return ""
+	}
+
+	return r.store.peers[rand.Intn(len(r.store.peers))]
+}
+
+// isLeader always returns false for remote raft.
+func (r *remoteRaft) isLeader() bool {
+	return false
+}
+
+// lastIndex returns the index of the store's cached metadata.
+func (r *remoteRaft) lastIndex() uint64 {
+	return r.store.cachedData().Index
+}
+
+// sync refreshes the store's metadata via store.invalidate. The index and
+// timeout arguments are currently ignored (see FIXME).
+func (r *remoteRaft) sync(index uint64, timeout time.Duration) error {
+	//FIXME: jwilder: check index and timeout
+	return r.store.invalidate()
+}
+
+// snapshot always fails: a node in remote raft state has no local raft
+// node to snapshot.
+func (r *remoteRaft) snapshot() error {
+	return errors.New("cannot snapshot while in remote raft state")
+}
+
+// readPeersJSON loads the peer addresses stored as a JSON array at path.
+// A missing or empty file yields a nil slice and no error; any other read
+// failure or a decode failure is returned to the caller.
+func readPeersJSON(path string) ([]string, error) {
+	data, err := ioutil.ReadFile(path)
+	if err != nil && !os.IsNotExist(err) {
+		return nil, err
+	}
+
+	// Nothing stored (or no file at all): report "no peers".
+	if len(data) == 0 {
+		return nil, nil
+	}
+
+	var addrs []string
+	if err := json.NewDecoder(bytes.NewReader(data)).Decode(&addrs); err != nil {
+		return nil, err
+	}
+	return addrs, nil
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/statement_executor.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/statement_executor.go
@ -10,6 +10,7 @@ import (
 type StatementExecutor struct {
 	Store interface {
 		Nodes() ([]NodeInfo, error)
+		Peers() ([]string, error)

 		Database(name string) (*DatabaseInfo, error)
 		Databases() ([]DatabaseInfo, error)
@ -127,9 +128,14 @@ func (e *StatementExecutor) executeShowServersStatement(q *influxql.ShowServersS
 		return &influxql.Result{Err: err}
 	}

-	row := &influxql.Row{Columns: []string{"id", "url"}}
+	peers, err := e.Store.Peers()
+	if err != nil {
+		return &influxql.Result{Err: err}
+	}
+
+	row := &influxql.Row{Columns: []string{"id", "cluster_addr", "raft"}}
 	for _, ni := range nis {
-		row.Values = append(row.Values, []interface{}{ni.ID, "http://" + ni.Host})
+		row.Values = append(row.Values, []interface{}{ni.ID, ni.Host, contains(peers, ni.Host)})
 	}
 	return &influxql.Result{Series: []*influxql.Row{row}}
 }
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/statement_executor_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/statement_executor_test.go
@ -121,15 +121,18 @@ func TestStatementExecutor_ExecuteStatement_ShowServers(t *testing.T) {
 			{ID: 2, Host: "node1"},
 		}, nil
 	}
+	e.Store.PeersFn = func() ([]string, error) {
+		return []string{"node0"}, nil
+	}

 	if res := e.ExecuteStatement(influxql.MustParseStatement(`SHOW SERVERS`)); res.Err != nil {
 		t.Fatal(res.Err)
 	} else if !reflect.DeepEqual(res.Series, influxql.Rows{
 		{
-			Columns: []string{"id", "url"},
+			Columns: []string{"id", "cluster_addr", "raft"},
 			Values: [][]interface{}{
-				{uint64(1), "http://node0"},
-				{uint64(2), "http://node1"},
+				{uint64(1), "node0", true},
+				{uint64(2), "node1", false},
 			},
 		},
 	}) {
@ -778,6 +781,7 @@ func NewStatementExecutor() *StatementExecutor {
 // StatementExecutorStore represents a mock implementation of StatementExecutor.Store.
 type StatementExecutorStore struct {
 	NodesFn                     func() ([]meta.NodeInfo, error)
+	PeersFn                     func() ([]string, error)
 	DatabaseFn                  func(name string) (*meta.DatabaseInfo, error)
 	DatabasesFn                 func() ([]meta.DatabaseInfo, error)
 	CreateDatabaseFn            func(name string) (*meta.DatabaseInfo, error)
@ -804,6 +808,10 @@ func (s *StatementExecutorStore) Nodes() ([]meta.NodeInfo, error) {
 	return s.NodesFn()
 }

+// Peers delegates to the PeersFn hook set by the test.
+func (s *StatementExecutorStore) Peers() ([]string, error) {
+	return s.PeersFn()
+}
+
 func (s *StatementExecutorStore) Database(name string) (*meta.DatabaseInfo, error) {
 	return s.DatabaseFn(name)
 }
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/store.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/store.go
@ -21,7 +21,6 @@ import (

 	"github.com/gogo/protobuf/proto"
 	"github.com/hashicorp/raft"
-	"github.com/hashicorp/raft-boltdb"
 	"github.com/influxdb/influxdb/influxql"
 	"github.com/influxdb/influxdb/meta/internal"
 	"golang.org/x/crypto/bcrypt"
@ -31,9 +30,12 @@ import (
 const (
 	MuxRaftHeader = 0
 	MuxExecHeader = 1
+	MuxRPCHeader  = 5

 	// SaltBytes is the number of bytes used for salts
 	SaltBytes = 32
+
+	DefaultSyncNodeDelay = time.Second
 )

 // ExecMagic is the first 4 bytes sent to a remote exec connection to verify
@ -45,6 +47,10 @@ const (
 	AutoCreateRetentionPolicyName   = "default"
 	AutoCreateRetentionPolicyPeriod = 0
 	RetentionPolicyMinDuration      = time.Hour
+
+	// MaxAutoCreatedRetentionPolicyReplicaN is the maximum replication factor that will
+	// be set for auto-created retention policies.
+	MaxAutoCreatedRetentionPolicyReplicaN = 3
 )

 // Raft configuration.
@ -53,6 +59,7 @@ const (
 	raftSnapshotsRetained = 2
 	raftTransportMaxPool  = 3
 	raftTransportTimeout  = 10 * time.Second
+	MaxRaftNodes          = 3
 )

 // Store represents a raft-backed metastore.
@ -68,17 +75,22 @@ type Store struct {

 	data *Data

-	remoteAddr net.Addr
-	raft       *raft.Raft
-	raftLayer  *raftLayer
-	peerStore  raft.PeerStore
-	transport  *raft.NetworkTransport
-	store      *raftboltdb.BoltStore
+	rpc *rpc
+
+	// The address used by other nodes to reach this node.
+	RemoteAddr net.Addr
+
+	raftState raftState

 	ready   chan struct{}
 	err     chan error
 	closing chan struct{}
 	wg      sync.WaitGroup
+	changed chan struct{}
+
+	// clusterTracingEnabled controls whether low-level cluster communication is logged.
+	// Useful for troubleshooting
+	clusterTracingEnabled bool

 	retentionAutoCreate bool

@ -86,6 +98,9 @@ type Store struct {
 	RaftListener net.Listener
 	ExecListener net.Listener

+	// The listener for higher-level, cluster operations
+	RPCListener net.Listener
+
 	// The advertised hostname of the store.
 	Addr net.Addr

@ -118,8 +133,8 @@ type authUser struct {
 }

 // NewStore returns a new instance of Store.
-func NewStore(c Config) *Store {
-	return &Store{
+func NewStore(c *Config) *Store {
+	s := &Store{
 		path:  c.Dir,
 		peers: c.Peers,
 		data:  &Data{},
@ -127,8 +142,10 @@ func NewStore(c Config) *Store {
 		ready:   make(chan struct{}),
 		err:     make(chan error),
 		closing: make(chan struct{}),
+		changed: make(chan struct{}),

-		retentionAutoCreate: c.RetentionAutoCreate,
+		clusterTracingEnabled: c.ClusterTracing,
+		retentionAutoCreate:   c.RetentionAutoCreate,

 		HeartbeatTimeout:   time.Duration(c.HeartbeatTimeout),
 		ElectionTimeout:    time.Duration(c.ElectionTimeout),
@ -140,6 +157,14 @@ func NewStore(c Config) *Store {
 		},
 		Logger: log.New(os.Stderr, "[metastore] ", log.LstdFlags),
 	}
+
+	s.raftState = &localRaft{store: s}
+	s.rpc = &rpc{
+		store:          s,
+		tracingEnabled: c.ClusterTracing,
+		logger:         s.Logger,
+	}
+	return s
 }

 // Path returns the root path when open.
@ -153,7 +178,7 @@ func (s *Store) IDPath() string { return filepath.Join(s.path, "id") }
 func (s *Store) Open() error {
 	// Verify that no more than 3 peers.
 	// https://github.com/influxdb/influxdb/issues/2750
-	if len(s.peers) > 3 {
+	if len(s.peers) > MaxRaftNodes {
 		return ErrTooManyPeers
 	}

@ -162,8 +187,12 @@ func (s *Store) Open() error {
 		panic("Store.RaftListener not set")
 	} else if s.ExecListener == nil {
 		panic("Store.ExecListener not set")
+	} else if s.RPCListener == nil {
+		panic("Store.RPCListener not set")
 	}

+	s.Logger.Printf("Using data dir: %v", s.Path())
+
 	if err := func() error {
 		s.mu.Lock()
 		defer s.mu.Unlock()
@ -174,8 +203,13 @@ func (s *Store) Open() error {
 		}
 		s.opened = true

+		// load our raft state
+		if err := s.loadState(); err != nil {
+			return err
+		}
+
 		// Create the root directory if it doesn't already exist.
-		if err := os.MkdirAll(s.path, 0777); err != nil {
+		if err := s.createRootDir(); err != nil {
 			return fmt.Errorf("mkdir all: %s", err)
 		}

@ -204,78 +238,186 @@ func (s *Store) Open() error {
 	s.wg.Add(1)
 	go s.serveExecListener()

+	s.wg.Add(1)
+	go s.serveRPCListener()
+
+	// Join an existing cluster if needed
+	if err := s.joinCluster(); err != nil {
+		return fmt.Errorf("join: %v", err)
+	}
+
 	// If the ID doesn't exist then create a new node.
 	if s.id == 0 {
 		go s.init()
 	} else {
+		go s.syncNodeInfo()
 		close(s.ready)
 	}

 	return nil
 }

+// syncNodeInfo continuously tries to update the current node's hostname
+// in the meta store.  It will retry until successful.
+//
+// It first waits for the store to become ready. Each attempt waits for a
+// leader, looks up this node by ID and, if the recorded host differs from
+// RemoteAddr, issues an UpdateNode. Any error causes a sleep of
+// DefaultSyncNodeDelay followed by another attempt, so the function only
+// ever returns nil.
+func (s *Store) syncNodeInfo() error {
+	<-s.ready
+
+	for {
+		if err := func() error {
+			if err := s.WaitForLeader(0); err != nil {
+				return err
+			}
+
+			ni, err := s.Node(s.id)
+			if err != nil {
+				return err
+			}
+
+			if ni == nil {
+				return ErrNodeNotFound
+			}
+
+			// The recorded host already matches: nothing to write.
+			if ni.Host == s.RemoteAddr.String() {
+				s.Logger.Printf("Updated node id=%d hostname=%v", s.id, s.RemoteAddr.String())
+				return nil
+			}
+
+			_, err = s.UpdateNode(s.id, s.RemoteAddr.String())
+			if err != nil {
+				return err
+			}
+			return nil
+		}(); err != nil {
+			// If we get an error, the cluster has not stabilized so just try again
+			time.Sleep(DefaultSyncNodeDelay)
+			continue
+		}
+		return nil
+	}
+}
+
+// loadState picks the raftState implementation based on what is on disk.
+//
+// With no peers in peers.json the store uses local raft (to bootstrap or
+// join a cluster) and keeps the configured peers. With peers on disk they
+// replace the configured ones, and the store uses local raft only if a
+// raft.db file can be stat'ed; otherwise it uses remote raft.
+func (s *Store) loadState() error {
+	peers, err := readPeersJSON(filepath.Join(s.path, "peers.json"))
+	if err != nil {
+		return err
+	}
+
+	// No peers on disk: raft must run locally to initialize a new cluster or
+	// join an existing one.
+	if len(peers) == 0 {
+		s.raftState = &localRaft{store: s}
+		return nil
+	}
+
+	// Peers found on disk take precedence over the configuration.
+	s.peers = peers
+
+	if _, statErr := os.Stat(filepath.Join(s.path, "raft.db")); statErr == nil {
+		// A raft database exists (maybe restored): run raft locally.
+		s.raftState = &localRaft{store: s}
+	} else {
+		s.raftState = &remoteRaft{store: s}
+	}
+	return nil
+}
+
+// joinCluster joins an existing cluster through one of the configured
+// peers. It does nothing when no peers are configured or when this node
+// already has an ID.
+//
+// Otherwise it cycles through the peers, sleeping one second between
+// rounds, until a join succeeds. On success it adopts the returned raft
+// node list and node ID, persists the ID, and switches to remote raft if the
+// response says raft is not enabled for this node. It returns only after a
+// fully successful join.
+func (s *Store) joinCluster() error {
+
+	// No join options, so nothing to do
+	if len(s.peers) == 0 {
+		return nil
+	}
+
+	// We already have a node ID so we're already part of a cluster,
+	// don't join again so we can use our existing state.
+	if s.id != 0 {
+		s.Logger.Printf("Skipping cluster join: already member of cluster: nodeId=%v raftEnabled=%v peers=%v",
+			s.id, raft.PeerContained(s.peers, s.RemoteAddr.String()), s.peers)
+		return nil
+	}
+
+	s.Logger.Printf("Joining cluster at: %v", s.peers)
+	for {
+		for _, join := range s.peers {
+			res, err := s.rpc.join(s.RemoteAddr.String(), join)
+			if err != nil {
+				s.Logger.Printf("Join node %v failed: %v: retrying...", join, err)
+				continue
+			}
+
+			s.Logger.Printf("Joined remote node %v", join)
+			s.Logger.Printf("nodeId=%v raftEnabled=%v peers=%v", res.NodeID, res.RaftEnabled, res.RaftNodes)
+
+			s.peers = res.RaftNodes
+			s.id = res.NodeID
+
+			// A failure below breaks out of the inner loop, so the whole join
+			// is retried after the sleep.
+			if err := s.writeNodeID(res.NodeID); err != nil {
+				s.Logger.Printf("Write node id failed: %v", err)
+				break
+			}
+
+			if !res.RaftEnabled {
+				// Shutdown our local raft and transition to a remote raft state
+				if err := s.enableRemoteRaft(); err != nil {
+					s.Logger.Printf("Enable remote raft failed: %v", err)
+					break
+				}
+			}
+			return nil
+		}
+		time.Sleep(time.Second)
+	}
+}
+
+// enableLocalRaft switches the store to a fresh localRaft state via
+// changeState. It is a no-op if the current state is already local.
+func (s *Store) enableLocalRaft() error {
+	if _, ok := s.raftState.(*localRaft); ok {
+		return nil
+	}
+	s.Logger.Printf("Switching to local raft")
+
+	lr := &localRaft{store: s}
+	return s.changeState(lr)
+}
+
+// enableRemoteRaft switches the store to a fresh remoteRaft state via
+// changeState. It is a no-op if the current state is already remote.
+func (s *Store) enableRemoteRaft() error {
+	if _, ok := s.raftState.(*remoteRaft); ok {
+		return nil
+	}
+
+	s.Logger.Printf("Switching to remote raft")
+	rr := &remoteRaft{store: s}
+	return s.changeState(rr)
+}
+
+// changeState swaps the store's raft strategy: the current state is closed
+// and its persistent data removed, then state is installed and opened. If
+// closing or removing the old state fails, the old state stays installed
+// and the error is returned.
+func (s *Store) changeState(state raftState) error {
+	current := s.raftState
+
+	if err := current.close(); err != nil {
+		return err
+	}
+
+	// Clear out any persistent state
+	if err := current.remove(); err != nil {
+		return err
+	}
+
+	s.raftState = state
+	return state.open()
+}
+
 // openRaft initializes the raft store.
 func (s *Store) openRaft() error {
-	// Setup raft configuration.
-	config := raft.DefaultConfig()
-	config.Logger = s.Logger
-	config.HeartbeatTimeout = s.HeartbeatTimeout
-	config.ElectionTimeout = s.ElectionTimeout
-	config.LeaderLeaseTimeout = s.LeaderLeaseTimeout
-	config.CommitTimeout = s.CommitTimeout
-
-	// If no peers are set in the config then start as a single server.
-	config.EnableSingleNode = (len(s.peers) == 0)
-
-	// Build raft layer to multiplex listener.
-	s.raftLayer = newRaftLayer(s.RaftListener, s.Addr)
-
-	// Create a transport layer
-	s.transport = raft.NewNetworkTransport(s.raftLayer, 3, 10*time.Second, os.Stderr)
-
-	// Create peer storage.
-	s.peerStore = raft.NewJSONPeers(s.path, s.transport)
-
-	// Create the log store and stable store.
-	store, err := raftboltdb.NewBoltStore(filepath.Join(s.path, "raft.db"))
-	if err != nil {
-		return fmt.Errorf("new bolt store: %s", err)
-	}
-	s.store = store
-
-	// Create the snapshot store.
-	snapshots, err := raft.NewFileSnapshotStore(s.path, raftSnapshotsRetained, os.Stderr)
-	if err != nil {
-		return fmt.Errorf("file snapshot store: %s", err)
-	}
-
-	// Create raft log.
-	r, err := raft.NewRaft(config, (*storeFSM)(s), store, store, snapshots, s.peerStore, s.transport)
-	if err != nil {
-		return fmt.Errorf("new raft: %s", err)
-	}
-	s.raft = r
-
-	return nil
+	return s.raftState.open()
 }

 // initialize attempts to bootstrap the raft store if there are no committed entries.
 func (s *Store) initialize() error {
-	// If we have committed entries then the store is already in the cluster.
-	/*
-		if index, err := s.store.LastIndex(); err != nil {
-			return fmt.Errorf("last index: %s", err)
-		} else if index > 0 {
-			return nil
-		}
-	*/
-
-	// Force set peers.
-	if err := s.SetPeers(s.peers); err != nil {
-		return fmt.Errorf("set raft peers: %s", err)
-	}
-
-	return nil
+	return s.raftState.initialize()
 }

 // Close closes the store and shuts down the node in the cluster.
@ -285,6 +427,23 @@ func (s *Store) Close() error {
 	return s.close()
 }

+// WaitForDataChanged blocks the calling goroutine until the store's
+// changed channel fires, returning nil. If the store starts closing first,
+// it returns a "closing" error instead.
+func (s *Store) WaitForDataChanged() error {
+	// Capture the current channel under the lock.
+	s.mu.RLock()
+	ch := s.changed
+	s.mu.RUnlock()
+
+	select {
+	case <-ch:
+		return nil
+	case <-s.closing:
+		return errors.New("closing")
+	}
+}
+
 func (s *Store) close() error {
 	// Check if store has already been closed.
 	if !s.opened {
@ -296,18 +455,9 @@ func (s *Store) close() error {
 	close(s.closing)
 	// FIXME(benbjohnson): s.wg.Wait()

-	// Shutdown raft.
-	if s.raft != nil {
-		s.raft.Shutdown()
-		s.raft = nil
-	}
-	if s.transport != nil {
-		s.transport.Close()
-		s.transport = nil
-	}
-	if s.store != nil {
-		s.store.Close()
-		s.store = nil
+	if s.raftState != nil {
+		s.raftState.close()
+		s.raftState = nil
 	}

 	return nil
@ -329,8 +479,6 @@ func (s *Store) readID() error {
 	}
 	s.id = id

-	s.Logger.Printf("read local node id: %d", s.id)
-
 	return nil
 }

@ -357,37 +505,43 @@ func (s *Store) createLocalNode() error {
 	}

 	// Create new node.
-	ni, err := s.CreateNode(s.Addr.String())
+	ni, err := s.CreateNode(s.RemoteAddr.String())
 	if err != nil {
 		return fmt.Errorf("create node: %s", err)
 	}

 	// Write node id to file.
-	if err := ioutil.WriteFile(s.IDPath(), []byte(strconv.FormatUint(ni.ID, 10)), 0666); err != nil {
+	if err := s.writeNodeID(ni.ID); err != nil {
 		return fmt.Errorf("write file: %s", err)
 	}

 	// Set ID locally.
 	s.id = ni.ID

-	s.Logger.Printf("created local node: id=%d, host=%s", s.id, s.Addr.String())
+	s.Logger.Printf("Created local node: id=%d, host=%s", s.id, s.RemoteAddr)

 	return nil
 }

+// createRootDir creates the store's root directory, and any missing
+// parents, with mode 0777.
+func (s *Store) createRootDir() error {
+	return os.MkdirAll(s.path, 0777)
+}
+
+// writeNodeID writes id in decimal to the store's ID file, creating the
+// root directory first if needed.
+func (s *Store) writeNodeID(id uint64) error {
+	if err := s.createRootDir(); err != nil {
+		return err
+	}
+	return ioutil.WriteFile(s.IDPath(), []byte(strconv.FormatUint(id, 10)), 0666)
+}
+
 // Snapshot saves a snapshot of the current state.
 func (s *Store) Snapshot() error {
-	future := s.raft.Snapshot()
-	return future.Error()
+	return s.raftState.snapshot()
 }

 // WaitForLeader sleeps until a leader is found or a timeout occurs.
 // timeout == 0 means to wait forever.
 func (s *Store) WaitForLeader(timeout time.Duration) error {
-	if s.raft.Leader() != "" {
-		return nil
-	}
-
 	// Begin timeout timer.
 	timer := time.NewTimer(timeout)
 	defer timer.Stop()
@ -404,7 +558,7 @@ func (s *Store) WaitForLeader(timeout time.Duration) error {
 				return errors.New("timeout")
 			}
 		case <-ticker.C:
-			if s.raft.Leader() != "" {
+			if s.Leader() != "" {
 				return nil
 			}
 		}
@ -421,10 +575,10 @@ func (s *Store) Err() <-chan error { return s.err }
 func (s *Store) IsLeader() bool {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
-	if s.raft == nil {
+	if s.raftState == nil {
 		return false
 	}
-	return s.raft.State() == raft.Leader
+	return s.raftState.isLeader()
 }

 // Leader returns what the store thinks is the current leader. An empty
@ -432,32 +586,27 @@ func (s *Store) IsLeader() bool {
 func (s *Store) Leader() string {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
-	if s.raft == nil {
+	if s.raftState == nil {
 		return ""
 	}
-	return s.raft.Leader()
-}
-
-// LeaderCh returns a channel that notifies on leadership change.
-// Panics when the store has not been opened yet.
-func (s *Store) LeaderCh() <-chan bool {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-	assert(s.raft != nil, "cannot retrieve leadership channel when closed")
-	return s.raft.LeaderCh()
+	return s.raftState.leader()
 }

 // SetPeers sets a list of peers in the cluster.
 func (s *Store) SetPeers(addrs []string) error {
-	a := make([]string, len(addrs))
-	for i, s := range addrs {
-		addr, err := net.ResolveTCPAddr("tcp", s)
-		if err != nil {
-			return fmt.Errorf("cannot resolve addr: %s, err=%s", s, err)
-		}
-		a[i] = addr.String()
-	}
-	return s.raft.SetPeers(a).Error()
+	return s.raftState.setPeers(addrs)
+}
+
+// AddPeer adds addr to the list of peers in the cluster by delegating to
+// the current raft state.
+func (s *Store) AddPeer(addr string) error {
+	return s.raftState.addPeer(addr)
+}
+
+// Peers returns the list of peers in the cluster as reported by the current
+// raft state. The store's read lock is held for the duration of the call.
+func (s *Store) Peers() ([]string, error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	return s.raftState.peers()
 }

 // serveExecListener processes remote exec connections.
@ -471,10 +620,9 @@ func (s *Store) serveExecListener() {
 		if err != nil {
 			if strings.Contains(err.Error(), "connection closed") {
 				return
-			} else {
-				s.Logger.Printf("temporary accept error: %s", err)
-				continue
 			}
+			s.Logger.Printf("temporary accept error: %s", err)
+			continue
 		}

 		// Handle connection in a separate goroutine.
@ -487,6 +635,31 @@ func (s *Store) serveExecListener() {
 func (s *Store) handleExecConn(conn net.Conn) {
 	defer s.wg.Done()

+	// Nodes that are not part of the raft cluster may initiate remote exec
+	// commands but may not know the current leader of the cluster.  If we are
+	// not the leader, proxy the request to the current leader.
+	if !s.IsLeader() {
+
+		if s.Leader() == s.RemoteAddr.String() {
+			s.Logger.Printf("No leader")
+			return
+		}
+
+		leaderConn, err := net.DialTimeout("tcp", s.Leader(), 10*time.Second)
+		if err != nil {
+			s.Logger.Printf("Dial leader: %v", err)
+			return
+		}
+		defer leaderConn.Close()
+		leaderConn.Write([]byte{MuxExecHeader})
+
+		if err := proxy(leaderConn.(*net.TCPConn), conn.(*net.TCPConn)); err != nil {
+			s.Logger.Printf("Leader proxy error: %v", err)
+		}
+		conn.Close()
+		return
+	}
+
 	// Read and execute command.
 	err := func() error {
 		// Read marker message.
@ -524,7 +697,7 @@ func (s *Store) handleExecConn(conn net.Conn) {
 	// Build response message.
 	var resp internal.Response
 	resp.OK = proto.Bool(err == nil)
-	resp.Index = proto.Uint64(s.raft.LastIndex())
+	resp.Index = proto.Uint64(s.raftState.lastIndex())
 	if err != nil {
 		resp.Error = proto.String(err.Error())
 	}
@ -533,13 +706,39 @@ func (s *Store) handleExecConn(conn net.Conn) {
 	if b, err := proto.Marshal(&resp); err != nil {
 		panic(err)
 	} else if err = binary.Write(conn, binary.BigEndian, uint64(len(b))); err != nil {
-		s.Logger.Printf("unable to write exec response size: %s", err)
+		s.Logger.Printf("Unable to write exec response size: %s", err)
 	} else if _, err = conn.Write(b); err != nil {
-		s.Logger.Printf("unable to write exec response: %s", err)
+		s.Logger.Printf("Unable to write exec response: %s", err)
 	}
 	conn.Close()
 }

+// serveRPCListener accepts connections on RPCListener and hands each one to
+// the rpc handler in its own goroutine. It returns once Accept reports that
+// the connection is closed; other accept errors are logged and retried.
+// This function runs in a separate goroutine.
+func (s *Store) serveRPCListener() {
+	defer s.wg.Done()
+
+	for {
+		// Accept next TCP connection.
+		conn, err := s.RPCListener.Accept()
+		if err != nil {
+			if strings.Contains(err.Error(), "connection closed") {
+				return
+			}
+			s.Logger.Printf("temporary accept error: %s", err)
+			continue
+		}
+
+		// Handle connection in a separate goroutine.
+		s.wg.Add(1)
+		go func() {
+			defer s.wg.Done()
+			s.rpc.handleRPCConn(conn)
+		}()
+	}
+}
+
 // MarshalBinary encodes the store's data to a binary protobuf format.
 func (s *Store) MarshalBinary() ([]byte, error) {
 	s.mu.RLock()
@ -607,6 +806,19 @@ func (s *Store) CreateNode(host string) (*NodeInfo, error) {
 	return s.NodeByHost(host)
 }

+// UpdateNode updates an existing node in the store.
+func (s *Store) UpdateNode(id uint64, host string) (*NodeInfo, error) {
+	if err := s.exec(internal.Command_UpdateNodeCommand, internal.E_UpdateNodeCommand_Command,
+		&internal.UpdateNodeCommand{
+			ID:   proto.Uint64(id),
+			Host: proto.String(host),
+		},
+	); err != nil {
+		return nil, err
+	}
+	return s.NodeByHost(host)
+}
+
 // DeleteNode removes a node from the metastore by id.
 func (s *Store) DeleteNode(id uint64) error {
 	return s.exec(internal.Command_DeleteNodeCommand, internal.E_DeleteNodeCommand_Command,
@ -658,6 +870,10 @@ func (s *Store) CreateDatabase(name string) (*DatabaseInfo, error) {
 			return nil, fmt.Errorf("read: %s", err)
 		}

+		if nodeN > MaxAutoCreatedRetentionPolicyReplicaN {
+			nodeN = MaxAutoCreatedRetentionPolicyReplicaN
+		}
+
 		// Create a retention policy.
 		rpi := NewRetentionPolicyInfo(AutoCreateRetentionPolicyName)
 		rpi.ReplicaN = nodeN
@ -685,11 +901,11 @@ func (s *Store) CreateDatabaseIfNotExists(name string) (*DatabaseInfo, error) {
 	}

 	// Attempt to create database.
-	if di, err := s.CreateDatabase(name); err == ErrDatabaseExists {
+	di, err := s.CreateDatabase(name)
+	if err == ErrDatabaseExists {
 		return s.Database(name)
-	} else {
-		return di, err
 	}
+	return di, err
 }

 // DropDatabase removes a database from the metastore by name.
@ -774,11 +990,11 @@ func (s *Store) CreateRetentionPolicyIfNotExists(database string, rpi *Retention
 	}

 	// Attempt to create policy.
-	if other, err := s.CreateRetentionPolicy(database, rpi); err == ErrRetentionPolicyExists {
+	other, err := s.CreateRetentionPolicy(database, rpi)
+	if err == ErrRetentionPolicyExists {
 		return s.RetentionPolicy(database, rpi.Name)
-	} else {
-		return other, err
 	}
+	return other, err
 }

 // SetDefaultRetentionPolicy sets the default retention policy for a database.
@ -858,11 +1074,11 @@ func (s *Store) CreateShardGroupIfNotExists(database, policy string, timestamp t
 	}

 	// Attempt to create database.
-	if sgi, err := s.CreateShardGroup(database, policy, timestamp); err == ErrShardGroupExists {
+	sgi, err := s.CreateShardGroup(database, policy, timestamp)
+	if err == ErrShardGroupExists {
 		return s.ShardGroupByTimestamp(database, policy, timestamp)
-	} else {
-		return sgi, err
 	}
+	return sgi, err
 }

 // DeleteShardGroup removes an existing shard group from a policy by ID.
@ -1037,9 +1253,8 @@ func (s *Store) Authenticate(username, password string) (ui *UserInfo, err error
 			if bytes.Equal(hashed, au.hash) {
 				ui = u
 				return nil
-			} else {
-				return ErrAuthenticate
 			}
+			return ErrAuthenticate
 		}

 		// Compare password with user hash.
@ -1264,8 +1479,7 @@ func (s *Store) read(fn func(*Data) error) error {
 var errInvalidate = errors.New("invalidate cache")

 func (s *Store) invalidate() error {
-	time.Sleep(1 * time.Second)
-	return nil // FIXME(benbjohnson): Reload cache from the leader.
+	return s.raftState.invalidate()
 }

 func (s *Store) exec(typ internal.Command_Type, desc *proto.ExtensionDesc, value interface{}) error {
@ -1280,36 +1494,21 @@ func (s *Store) exec(typ internal.Command_Type, desc *proto.ExtensionDesc, value

 	// Apply the command if this is the leader.
 	// Otherwise remotely execute the command against the current leader.
-	if s.raft.State() == raft.Leader {
+	if s.raftState.isLeader() {
 		return s.apply(b)
-	} else {
-		return s.remoteExec(b)
 	}
+	return s.remoteExec(b)
 }

 // apply applies a serialized command to the raft log.
 func (s *Store) apply(b []byte) error {
-	// Apply to raft log.
-	f := s.raft.Apply(b, 0)
-	if err := f.Error(); err != nil {
-		return err
-	}
-
-	// Return response if it's an error.
-	// No other non-nil objects should be returned.
-	resp := f.Response()
-	if err, ok := resp.(error); ok {
-		return lookupError(err)
-	}
-	assert(resp == nil, "unexpected response: %#v", resp)
-
-	return nil
+	return s.raftState.apply(b)
 }

 // remoteExec sends an encoded command to the remote leader.
 func (s *Store) remoteExec(b []byte) error {
 	// Retrieve the current known leader.
-	leader := s.raft.Leader()
+	leader := s.raftState.leader()
 	if leader == "" {
 		return errors.New("no leader")
 	}
@ -1368,30 +1567,13 @@ func (s *Store) remoteExec(b []byte) error {

 // sync polls the state machine until it reaches a given index.
 func (s *Store) sync(index uint64, timeout time.Duration) error {
-	ticker := time.NewTicker(100 * time.Millisecond)
-	defer ticker.Stop()
+	return s.raftState.sync(index, timeout)
+}

-	timer := time.NewTimer(timeout)
-	defer timer.Stop()
-
-	for {
-		// Wait for next tick or timeout.
-		select {
-		case <-ticker.C:
-		case <-timer.C:
-			return errors.New("timeout")
-		}
-
-		// Compare index against current metadata.
-		s.mu.Lock()
-		ok := (s.data.Index >= index)
-		s.mu.Unlock()
-
-		// Exit if we are at least at the given index.
-		if ok {
-			return nil
-		}
-	}
+func (s *Store) cachedData() *Data {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	return s.data.Clone()
 }

 // BcryptCost is the cost associated with generating password with Bcrypt.
@ -1467,6 +1649,8 @@ func (fsm *storeFSM) Apply(l *raft.Log) interface{} {
 			return fsm.applySetAdminPrivilegeCommand(&cmd)
 		case internal.Command_SetDataCommand:
 			return fsm.applySetDataCommand(&cmd)
+		case internal.Command_UpdateNodeCommand:
+			return fsm.applyUpdateNodeCommand(&cmd)
 		default:
 			panic(fmt.Errorf("cannot apply command: %x", l.Data))
 		}
@ -1475,6 +1659,8 @@ func (fsm *storeFSM) Apply(l *raft.Log) interface{} {
 	// Copy term and index to new metadata.
 	fsm.data.Term = l.Term
 	fsm.data.Index = l.Index
+	close(s.changed)
+	s.changed = make(chan struct{})

 	return err
 }
@ -1498,6 +1684,23 @@ func (fsm *storeFSM) applyCreateNodeCommand(cmd *internal.Command) interface{} {
 	return nil
 }

+func (fsm *storeFSM) applyUpdateNodeCommand(cmd *internal.Command) interface{} {
+	ext, _ := proto.GetExtension(cmd, internal.E_UpdateNodeCommand_Command)
+	v := ext.(*internal.UpdateNodeCommand)
+
+	// Copy data and update.
+	other := fsm.data.Clone()
+	ni := other.Node(v.GetID())
+	if ni == nil {
+		return ErrNodeNotFound
+	}
+
+	ni.Host = v.GetHost()
+
+	fsm.data = other
+	return nil
+}
+
 func (fsm *storeFSM) applyDeleteNodeCommand(cmd *internal.Command) interface{} {
 	ext, _ := proto.GetExtension(cmd, internal.E_DeleteNodeCommand_Command)
 	v := ext.(*internal.DeleteNodeCommand)
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/store_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/meta/store_test.go
@ -218,14 +218,18 @@ func TestStore_DropDatabase(t *testing.T) {
 	}

 	// Ensure remaining nodes are correct.
-	if di, _ := s.Database("db0"); !reflect.DeepEqual(di, &meta.DatabaseInfo{Name: "db0"}) {
-		t.Fatalf("unexpected database(0): %#v", di)
+	exp := &meta.DatabaseInfo{Name: "db0"}
+	if di, _ := s.Database("db0"); !reflect.DeepEqual(di, exp) {
+		t.Fatalf("unexpected database(0): \ngot: %#v\nexp: %#v", di, exp)
+
 	}
 	if di, _ := s.Database("db1"); di != nil {
 		t.Fatalf("unexpected database(1): %#v", di)
 	}
-	if di, _ := s.Database("db2"); !reflect.DeepEqual(di, &meta.DatabaseInfo{Name: "db2"}) {
-		t.Fatalf("unexpected database(2): %#v", di)
+
+	exp = &meta.DatabaseInfo{Name: "db2"}
+	if di, _ := s.Database("db2"); !reflect.DeepEqual(di, exp) {
+		t.Fatalf("unexpected database(2): \ngot: %#v\nexp: %#v", di, exp)
 	}
 }

@ -300,8 +304,9 @@ func TestStore_DropRetentionPolicy(t *testing.T) {
 	if rpi, _ := s.RetentionPolicy("db0", "rp1"); rpi != nil {
 		t.Fatalf("unexpected policy(1): %#v", rpi)
 	}
-	if rpi, _ := s.RetentionPolicy("db0", "rp2"); !reflect.DeepEqual(rpi, &meta.RetentionPolicyInfo{Name: "rp2", ReplicaN: 1, ShardGroupDuration: 7 * 24 * time.Hour}) {
-		t.Fatalf("unexpected policy(2): %#v", rpi)
+	exp := &meta.RetentionPolicyInfo{Name: "rp2", ReplicaN: 1, ShardGroupDuration: 7 * 24 * time.Hour}
+	if rpi, _ := s.RetentionPolicy("db0", "rp2"); !reflect.DeepEqual(rpi, exp) {
+		t.Fatalf("unexpected policy(2): \ngot: %#v\nexp: %#v", rpi, exp)
 	}
 }

@ -730,6 +735,7 @@ func TestStore_Snapshot_And_Restore(t *testing.T) {

 	s := MustOpenStore()
 	s.LeaveFiles = true
+	addr := s.RemoteAddr.String()

 	// Create a bunch of databases in the Store
 	nDatabases := 5
@ -744,12 +750,12 @@ func TestStore_Snapshot_And_Restore(t *testing.T) {

 	s.Close()

+	// Allow the kernel to free up the port so we can re-use it again
+	time.Sleep(100 * time.Millisecond)
+
 	// Test restoring the snapshot taken above.
 	existingDataPath := s.Path()
-	s = NewStore(NewConfig(existingDataPath))
-	if err := s.Open(); err != nil {
-		panic(err)
-	}
+	s = MustOpenStoreWithPath(addr, existingDataPath)
 	defer s.Close()

 	// Wait until the server is ready.
@ -782,37 +788,105 @@ func TestCluster_Open(t *testing.T) {
 		t.Fatal("no leader found")
 	}

-	// Add a database to each node.
-	for i, s := range c.Stores {
-		if di, err := s.CreateDatabase(fmt.Sprintf("db%d", i)); err != nil {
-			t.Fatal(err)
-		} else if di == nil {
-			t.Fatal("expected database")
+	// ensure all the nodes see the same metastore data
+	assertDatabaseReplicated(t, c)
+}
+
+// Ensure a multi-node cluster can start, join the cluster, and the first three members are raft nodes.
+func TestCluster_OpenRaft(t *testing.T) {
+	// Start a single node.
+	c := MustOpenCluster(1)
+	defer c.Close()
+
+	// Check that the node becomes leader.
+	if s := c.Leader(); s == nil {
+		t.Fatal("no leader found")
+	}
+
+	// Add 5 more nodes.
+	for i := 0; i < 5; i++ {
+		if err := c.Join(); err != nil {
+			t.Fatalf("failed to join cluster: %v", err)
 		}
 	}

-	// Verify that each store has all databases.
-	for i := 0; i < len(c.Stores); i++ {
-		for _, s := range c.Stores {
-			if di, err := s.Database(fmt.Sprintf("db%d", i)); err != nil {
-				t.Fatal(err)
-			} else if di == nil {
-				t.Fatal("expected database")
-			}
+	// ensure we have 3 raft nodes
+	assertRaftPeerNodes(t, c, 3)
+
+	// ensure all the nodes see the same metastore data
+	assertDatabaseReplicated(t, c)
+}
+
+// Ensure a multi-node cluster can restart
+func TestCluster_Restart(t *testing.T) {
+	// Start a single node.
+	c := MustOpenCluster(1)
+	defer c.Close()
+
+	// Check that one node is leader.
+	if s := c.Leader(); s == nil {
+		t.Fatal("no leader found")
+	}
+
+	// Add 5 more nodes, 2 should become raft peers, 3 remote raft clients
+	for i := 0; i < 5; i++ {
+		if err := c.Join(); err != nil {
+			t.Fatalf("failed to join cluster: %v", err)
 		}
 	}
+
+	// The tests use a host-assigned listener port.  We need to re-use
+	// the original ports when the new cluster is restarted so that the existing
+	// peer store addresses can be reached.
+	addrs := []string{}
+
+	// Make sure we keep files on disk when we shutdown as well as record the
+	// current cluster IP addresses
+	for _, s := range c.Stores {
+		s.LeaveFiles = true
+		addrs = append(addrs, s.Addr.String())
+	}
+
+	// Stop the cluster
+	if err := c.Close(); err != nil {
+		t.Fatalf("failed to close cluster: %v", err)
+	}
+
+	// Wait a bit to avoid spurious port in use conflict errors from trying to
+	// start the new cluster too fast
+	time.Sleep(100 * time.Millisecond)
+
+	// Re-create the cluster nodes from existing disk paths and addresses
+	stores := []*Store{}
+	for i, s := range c.Stores {
+		store := MustOpenStoreWithPath(addrs[i], s.Path())
+		stores = append(stores, store)
+	}
+	c.Stores = stores
+
+	// Wait for the cluster to stabilize
+	if err := c.WaitForLeader(); err != nil {
+		t.Fatal("no leader found")
+	}
+
+	// ensure we have 3 raft nodes
+	assertRaftPeerNodes(t, c, 3)
+
+	// ensure all the nodes see the same metastore data
+	assertDatabaseReplicated(t, c)
 }

 // Store is a test wrapper for meta.Store.
 type Store struct {
 	*meta.Store
-	Listener   net.Listener
-	Stderr     bytes.Buffer
-	LeaveFiles bool // set to true to leave temporary files on close
+	BindAddress string
+	Listener    net.Listener
+	Stderr      bytes.Buffer
+	LeaveFiles  bool // set to true to leave temporary files on close
 }

 // NewStore returns a new test wrapper for Store.
-func NewStore(c meta.Config) *Store {
+func NewStore(c *meta.Config) *Store {
 	s := &Store{
 		Store: meta.NewStore(c),
 	}
@ -823,7 +897,16 @@ func NewStore(c meta.Config) *Store {

 // MustOpenStore opens a store in a temporary path. Panic on error.
 func MustOpenStore() *Store {
-	s := NewStore(NewConfig(MustTempFile()))
+	return MustOpenStoreWithPath("", MustTempFile())
+}
+
+// MustOpenStoreWithPath opens a store at the given path, binding to addr if it is non-empty. Panic on error.
+func MustOpenStoreWithPath(addr, path string) *Store {
+	c := NewConfig(path)
+	s := NewStore(c)
+	if addr != "" {
+		s.BindAddress = addr
+	}
 	if err := s.Open(); err != nil {
 		panic(err)
 	}
@ -840,18 +923,26 @@ func MustOpenStore() *Store {

 // Open opens the store on a random TCP port.
 func (s *Store) Open() error {
+
+	addr := "127.0.0.1:0"
+	if s.BindAddress != "" {
+		addr = s.BindAddress
+	}
 	// Open a TCP port.
-	ln, err := net.Listen("tcp", "127.0.0.1:0")
+	ln, err := net.Listen("tcp", addr)
 	if err != nil {
 		return fmt.Errorf("listen: %s", err)
 	}
 	s.Addr = ln.Addr()
 	s.Listener = ln
+	s.RemoteAddr = s.Addr

 	// Wrap listener in a muxer.
 	mux := tcp.NewMux()
 	s.RaftListener = mux.Listen(meta.MuxRaftHeader)
 	s.ExecListener = mux.Listen(meta.MuxExecHeader)
+	s.RPCListener = mux.Listen(meta.MuxRPCHeader)
+
 	go mux.Serve(ln)

 	// Open store.
@ -874,8 +965,8 @@ func (s *Store) Close() error {
 }

 // NewConfig returns the default test configuration.
-func NewConfig(path string) meta.Config {
-	return meta.Config{
+func NewConfig(path string) *meta.Config {
+	return &meta.Config{
 		Dir:                path,
 		Hostname:           "localhost",
 		BindAddress:        "127.0.0.1:0",
@ -888,27 +979,17 @@ func NewConfig(path string) meta.Config {

 // Cluster represents a group of stores joined as a raft cluster.
 type Cluster struct {
+	path   string
 	Stores []*Store
+	n      int
 }

 // NewCluster returns a cluster of n stores within path.
 func NewCluster(path string, n int) *Cluster {
-	c := &Cluster{}
-
-	// Construct a list of temporary peers.
-	peers := make([]string, n)
-	for i := range peers {
-		peers[i] = "127.0.0.1:0"
-	}
-
-	// Create new stores with temporary peers.
-	for i := 0; i < n; i++ {
-		config := NewConfig(filepath.Join(path, strconv.Itoa(i)))
-		config.Peers = peers
-		s := NewStore(config)
-		c.Stores = append(c.Stores, s)
-	}
-
+	c := &Cluster{path: path, n: n}
+	config := NewConfig(filepath.Join(path, strconv.Itoa(0)))
+	s := NewStore(config)
+	c.Stores = append(c.Stores, s)
 	return c
 }

@ -930,22 +1011,34 @@ func MustOpenCluster(n int) *Cluster {
 	return c
 }

+func (c *Cluster) Join() error {
+	config := NewConfig(filepath.Join(c.path, strconv.Itoa(len(c.Stores))))
+	config.Peers = []string{c.Stores[0].Addr.String()}
+	s := NewStore(config)
+	if err := s.Open(); err != nil {
+		return err
+	}
+	select {
+	case err := <-s.Err():
+		panic(fmt.Sprintf("store: i=%d, addr=%s, err=%s", len(c.Stores), s.Addr.String(), err))
+	case <-s.Ready():
+	}
+
+	c.Stores = append(c.Stores, s)
+	return nil
+}
+
 // Open opens and initializes all stores in the cluster.
 func (c *Cluster) Open() error {
 	if err := func() error {
-		// Open each store and add to peer list.
-		peers := make([]string, len(c.Stores))
-		for i, s := range c.Stores {
-			if err := s.Open(); err != nil {
-				return fmt.Errorf("open test store #%d: %s", i, err)
-			}
-			peers[i] = s.Addr.String()
+
+		if err := c.Stores[0].Open(); err != nil {
+			return err
 		}

-		// Reset peers on all stores.
-		for _, s := range c.Stores {
-			if err := s.SetPeers(peers); err != nil {
-				return fmt.Errorf("set peers: %s", err)
+		for i := 1; i < c.n; i++ {
+			if err := c.Join(); err != nil {
+				panic(fmt.Sprintf("failed to add new cluster node: %v", err))
 			}
 		}

@ -965,6 +1058,15 @@ func (c *Cluster) Close() error {
 	return nil
 }

+func (c *Cluster) WaitForLeader() error {
+	for _, s := range c.Stores {
+		if err := s.WaitForLeader(5 * time.Second); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
 // Leader returns the store that is currently leader.
 func (c *Cluster) Leader() *Store {
 	for _, s := range c.Stores {
@ -987,3 +1089,44 @@ func MustTempFile() string {
 func mockHashPassword(password string) ([]byte, error) {
 	return []byte(password), nil
 }
+
+// assertRaftPeerNodes counts the number of nodes running with a local raft
+// database and asserts that the count is equal to n
+func assertRaftPeerNodes(t *testing.T, c *Cluster, n int) {
+	// Ensure we have the required number of raft nodes
+	raftCount := 0
+	for _, s := range c.Stores {
+		if _, err := os.Stat(filepath.Join(s.Path(), "raft.db")); err == nil {
+			raftCount += 1
+		}
+	}
+
+	if raftCount != n {
+		t.Errorf("raft nodes mismatch: got %v, exp %v", raftCount, n)
+	}
+}
+
+// assertDatabaseReplicated creates a new database named after each node and
+// then verifies that each node can see all the created databases from their
+// local meta data
+func assertDatabaseReplicated(t *testing.T, c *Cluster) {
+	// Add a database to each node.
+	for i, s := range c.Stores {
+		if di, err := s.CreateDatabase(fmt.Sprintf("db%d", i)); err != nil {
+			t.Fatal(err)
+		} else if di == nil {
+			t.Fatal("expected database")
+		}
+	}
+
+	// Verify that each store has all databases.
+	for i := 0; i < len(c.Stores); i++ {
+		for _, s := range c.Stores {
+			if di, err := s.Database(fmt.Sprintf("db%d", i)); err != nil {
+				t.Fatal(err)
+			} else if di == nil {
+				t.Fatal("expected database")
+			}
+		}
+	}
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/batcher.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/batcher.go
@ -9,6 +9,8 @@ import (
 // PointBatcher accepts Points and will emit a batch of those points when either
 // a) the batch reaches a certain size, or b) a certain time passes.
 type PointBatcher struct {
+	stats PointBatcherStats
+
 	size     int
 	duration time.Duration

@ -17,8 +19,6 @@ type PointBatcher struct {
 	out   chan []Point
 	flush chan struct{}

-	stats PointBatcherStats
-
 	wg *sync.WaitGroup
 }

--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/batcher_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/batcher_test.go
@ -1,21 +1,23 @@
-package tsdb
+package tsdb_test

 import (
 	"testing"
 	"time"
+
+	"github.com/influxdb/influxdb/tsdb"
 )

 // TestBatch_Size ensures that a batcher generates a batch when the size threshold is reached.
 func TestBatch_Size(t *testing.T) {
 	batchSize := 5
-	batcher := NewPointBatcher(batchSize, time.Hour)
+	batcher := tsdb.NewPointBatcher(batchSize, time.Hour)
 	if batcher == nil {
 		t.Fatal("failed to create batcher for size test")
 	}

 	batcher.Start()

-	var p Point
+	var p tsdb.Point
 	go func() {
 		for i := 0; i < batchSize; i++ {
 			batcher.In() <- p
@ -31,14 +33,14 @@ func TestBatch_Size(t *testing.T) {
 // TestBatch_Size ensures that a batcher generates a batch when the timeout triggers.
 func TestBatch_Timeout(t *testing.T) {
 	batchSize := 5
-	batcher := NewPointBatcher(batchSize+1, 100*time.Millisecond)
+	batcher := tsdb.NewPointBatcher(batchSize+1, 100*time.Millisecond)
 	if batcher == nil {
 		t.Fatal("failed to create batcher for timeout test")
 	}

 	batcher.Start()

-	var p Point
+	var p tsdb.Point
 	go func() {
 		for i := 0; i < batchSize; i++ {
 			batcher.In() <- p
@ -54,14 +56,14 @@ func TestBatch_Timeout(t *testing.T) {
 // TestBatch_Flush ensures that a batcher generates a batch when flushed
 func TestBatch_Flush(t *testing.T) {
 	batchSize := 2
-	batcher := NewPointBatcher(batchSize, time.Hour)
+	batcher := tsdb.NewPointBatcher(batchSize, time.Hour)
 	if batcher == nil {
 		t.Fatal("failed to create batcher for flush test")
 	}

 	batcher.Start()

-	var p Point
+	var p tsdb.Point
 	go func() {
 		batcher.In() <- p
 		batcher.Flush()
@ -76,15 +78,15 @@ func TestBatch_Flush(t *testing.T) {
 // TestBatch_MultipleBatches ensures that a batcher correctly processes multiple batches.
 func TestBatch_MultipleBatches(t *testing.T) {
 	batchSize := 2
-	batcher := NewPointBatcher(batchSize, 100*time.Millisecond)
+	batcher := tsdb.NewPointBatcher(batchSize, 100*time.Millisecond)
 	if batcher == nil {
 		t.Fatal("failed to create batcher for size test")
 	}

 	batcher.Start()

-	var p Point
-	var b []Point
+	var p tsdb.Point
+	var b []tsdb.Point

 	batcher.In() <- p
 	batcher.In() <- p
@ -102,7 +104,7 @@ func TestBatch_MultipleBatches(t *testing.T) {
 	checkPointBatcherStats(t, batcher, -1, 3, 1, 1)
 }

-func checkPointBatcherStats(t *testing.T, b *PointBatcher, batchTotal, pointTotal, sizeTotal, timeoutTotal int) {
+func checkPointBatcherStats(t *testing.T, b *tsdb.PointBatcher, batchTotal, pointTotal, sizeTotal, timeoutTotal int) {
 	stats := b.Stats()

 	if batchTotal != -1 && stats.BatchTotal != uint64(batchTotal) {
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/cursor.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/cursor.go
@ -0,0 +1,119 @@
+package tsdb
+
+import (
+	"bytes"
+	"container/heap"
+)
+
+// MultiCursor returns a single cursor that combines the results of all cursors in order.
+//
+// If the same key is returned from multiple cursors then the first cursor
+// specified will take precedence. A key will only be returned once from the
+// returned cursor.
+func MultiCursor(cursors ...Cursor) Cursor {
+	return &multiCursor{cursors: cursors}
+}
+
+// multiCursor represents a cursor that combines multiple cursors into one.
+type multiCursor struct {
+	cursors []Cursor
+	heap    cursorHeap
+	prev    []byte
+}
+
+// Seek moves the cursor to a given key.
+func (mc *multiCursor) Seek(seek []byte) (key, value []byte) {
+	// Initialize heap.
+	h := make(cursorHeap, 0, len(mc.cursors))
+	for i, c := range mc.cursors {
+		// Move cursor to position. Skip if it's empty.
+		k, v := c.Seek(seek)
+		if k == nil {
+			continue
+		}
+
+		// Append cursor to heap.
+		h = append(h, &cursorHeapItem{
+			key:      k,
+			value:    v,
+			cursor:   c,
+			priority: len(mc.cursors) - i,
+		})
+	}
+
+	heap.Init(&h)
+	mc.heap = h
+	mc.prev = nil
+
+	return mc.pop()
+}
+
+// Next returns the next key/value from the cursor.
+func (mc *multiCursor) Next() (key, value []byte) { return mc.pop() }
+
+// pop returns the next item from the heap.
+// Reads the next key/value from item's cursor and puts it back on the heap.
+func (mc *multiCursor) pop() (key, value []byte) {
+	// Read items until we have a key that doesn't match the previously read one.
+	// This is to perform deduplication when there's multiple items with the same key.
+	// The highest priority cursor will be read first and then remaining keys will be dropped.
+	for {
+		// Return nil if there are no more items left.
+		if len(mc.heap) == 0 {
+			return nil, nil
+		}
+
+		// Read the next item from the heap.
+		item := heap.Pop(&mc.heap).(*cursorHeapItem)
+
+		// Save the key/value for return.
+		key, value = item.key, item.value
+
+		// Read the next item from the cursor. Push back to heap if one exists.
+		if item.key, item.value = item.cursor.Next(); item.key != nil {
+			heap.Push(&mc.heap, item)
+		}
+
+		// Skip if this key matches the previously returned one.
+		if bytes.Equal(mc.prev, key) {
+			continue
+		}
+
+		mc.prev = key
+		return
+	}
+}
+
+// cursorHeap represents a heap of cursorHeapItems.
+type cursorHeap []*cursorHeapItem
+
+func (h cursorHeap) Len() int      { return len(h) }
+func (h cursorHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
+func (h cursorHeap) Less(i, j int) bool {
+	if cmp := bytes.Compare(h[i].key, h[j].key); cmp == -1 {
+		return true
+	} else if cmp == 0 {
+		return h[i].priority > h[j].priority
+	}
+	return false
+}
+
+func (h *cursorHeap) Push(x interface{}) {
+	*h = append(*h, x.(*cursorHeapItem))
+}
+
+func (h *cursorHeap) Pop() interface{} {
+	old := *h
+	n := len(old)
+	item := old[n-1]
+	*h = old[0 : n-1]
+	return item
+}
+
+// cursorHeapItem is something we manage in a priority queue.
+type cursorHeapItem struct {
+	key      []byte
+	value    []byte
+	cursor   Cursor
+	priority int
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/cursor_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/cursor_test.go
@ -0,0 +1,221 @@
+package tsdb_test
+
+import (
+	"bytes"
+	"encoding/binary"
+	"math/rand"
+	"reflect"
+	"sort"
+	"testing"
+	"testing/quick"
+
+	"github.com/influxdb/influxdb/tsdb"
+)
+
+// Ensure the multi-cursor can correctly iterate across a single subcursor.
+func TestMultiCursor_Single(t *testing.T) {
+	mc := tsdb.MultiCursor(
+		NewCursor([]CursorItem{
+			{Key: []byte{0x00}, Value: []byte{0x00}},
+			{Key: []byte{0x01}, Value: []byte{0x10}},
+			{Key: []byte{0x02}, Value: []byte{0x20}},
+		}),
+	)
+
+	if k, v := mc.Seek([]byte{0x00}); !bytes.Equal(k, []byte{0x00}) || !bytes.Equal(v, []byte{0x00}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = mc.Next(); !bytes.Equal(k, []byte{0x01}) || !bytes.Equal(v, []byte{0x10}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = mc.Next(); !bytes.Equal(k, []byte{0x02}) || !bytes.Equal(v, []byte{0x20}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = mc.Next(); k != nil {
+		t.Fatalf("expected eof, got: %x / %x", k, v)
+	}
+}
+
+// Ensure the multi-cursor can correctly iterate across multiple non-overlapping subcursors.
+func TestMultiCursor_Multiple_NonOverlapping(t *testing.T) {
+	mc := tsdb.MultiCursor(
+		NewCursor([]CursorItem{
+			{Key: []byte{0x00}, Value: []byte{0x00}},
+			{Key: []byte{0x03}, Value: []byte{0x30}},
+			{Key: []byte{0x04}, Value: []byte{0x40}},
+		}),
+		NewCursor([]CursorItem{
+			{Key: []byte{0x01}, Value: []byte{0x10}},
+			{Key: []byte{0x02}, Value: []byte{0x20}},
+		}),
+	)
+
+	if k, v := mc.Seek([]byte{0x00}); !bytes.Equal(k, []byte{0x00}) || !bytes.Equal(v, []byte{0x00}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = mc.Next(); !bytes.Equal(k, []byte{0x01}) || !bytes.Equal(v, []byte{0x10}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = mc.Next(); !bytes.Equal(k, []byte{0x02}) || !bytes.Equal(v, []byte{0x20}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = mc.Next(); !bytes.Equal(k, []byte{0x03}) || !bytes.Equal(v, []byte{0x30}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = mc.Next(); !bytes.Equal(k, []byte{0x04}) || !bytes.Equal(v, []byte{0x40}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = mc.Next(); k != nil {
+		t.Fatalf("expected eof, got: %x / %x", k, v)
+	}
+}
+
+// Ensure the multi-cursor can correctly iterate across multiple overlapping subcursors.
+func TestMultiCursor_Multiple_Overlapping(t *testing.T) {
+	mc := tsdb.MultiCursor(
+		NewCursor([]CursorItem{
+			{Key: []byte{0x00}, Value: []byte{0x00}},
+			{Key: []byte{0x03}, Value: []byte{0x03}},
+			{Key: []byte{0x04}, Value: []byte{0x04}},
+		}),
+		NewCursor([]CursorItem{
+			{Key: []byte{0x00}, Value: []byte{0xF0}},
+			{Key: []byte{0x02}, Value: []byte{0xF2}},
+			{Key: []byte{0x04}, Value: []byte{0xF4}},
+		}),
+	)
+
+	if k, v := mc.Seek([]byte{0x00}); !bytes.Equal(k, []byte{0x00}) || !bytes.Equal(v, []byte{0x00}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = mc.Next(); !bytes.Equal(k, []byte{0x02}) || !bytes.Equal(v, []byte{0xF2}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = mc.Next(); !bytes.Equal(k, []byte{0x03}) || !bytes.Equal(v, []byte{0x03}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = mc.Next(); !bytes.Equal(k, []byte{0x04}) || !bytes.Equal(v, []byte{0x04}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = mc.Next(); k != nil {
+		t.Fatalf("expected eof, got: %x / %x", k, v)
+	}
+}
+
+// Ensure the multi-cursor can handle randomly generated data.
+func TestMultiCursor_Quick(t *testing.T) {
+	quick.Check(func(seek uint64, cursors []Cursor) bool {
+		var got, exp [][]byte
+		seek %= 100
+
+		// Merge all cursor data to determine expected output.
+		// First seen key overrides all other items with the same key.
+		m := make(map[string][]byte)
+		for _, c := range cursors {
+			for _, item := range c.items {
+				if bytes.Compare(item.Key, u64tob(seek)) == -1 {
+					continue
+				}
+				if _, ok := m[string(item.Key)]; ok {
+					continue
+				}
+				m[string(item.Key)] = item.Value
+			}
+		}
+
+		// Convert map back to single item list.
+		for k, v := range m {
+			exp = append(exp, append([]byte(k), v...))
+		}
+		sort.Sort(byteSlices(exp))
+
+		// Create multi-cursor and iterate over all items.
+		mc := tsdb.MultiCursor(tsdbCursorSlice(cursors)...)
+		for k, v := mc.Seek(u64tob(seek)); k != nil; k, v = mc.Next() {
+			got = append(got, append(k, v...))
+		}
+
+		// Verify results.
+		if !reflect.DeepEqual(got, exp) {
+			t.Fatalf("mismatch: seek=%d\n\ngot=%+v\n\nexp=%+v", seek, got, exp)
+		}
+
+		return true
+	}, nil)
+}
+
+// Cursor represents an in-memory test cursor.
+type Cursor struct {
+	items []CursorItem
+	index int
+}
+
+// NewCursor returns a new instance of Cursor.
+func NewCursor(items []CursorItem) *Cursor {
+	sort.Sort(CursorItems(items))
+	return &Cursor{items: items}
+}
+
+// Seek seeks to an item by key.
+func (c *Cursor) Seek(seek []byte) (key, value []byte) {
+	for c.index = 0; c.index < len(c.items); c.index++ {
+		if bytes.Compare(c.items[c.index].Key, seek) == -1 { // skip keys less than seek
+			continue
+		}
+		return c.items[c.index].Key, c.items[c.index].Value
+	}
+	return nil, nil
+}
+
+// Next returns the next key/value pair.
+func (c *Cursor) Next() (key, value []byte) {
+	if c.index >= len(c.items)-1 {
+		return nil, nil
+	}
+
+	c.index++
+	return c.items[c.index].Key, c.items[c.index].Value
+}
+
+// Generate returns a randomly generated cursor. Implements quick.Generator.
+func (c Cursor) Generate(rand *rand.Rand, size int) reflect.Value {
+	c.index = 0
+
+	c.items = make([]CursorItem, rand.Intn(size))
+	for i := range c.items {
+		value, _ := quick.Value(reflect.TypeOf([]byte(nil)), rand)
+
+		c.items[i] = CursorItem{
+			Key:   u64tob(uint64(rand.Intn(size))),
+			Value: value.Interface().([]byte),
+		}
+	}
+
+	// Sort items by key.
+	sort.Sort(CursorItems(c.items))
+
+	return reflect.ValueOf(c)
+}
+
+// tsdbCursorSlice converts a Cursor slice to a tsdb.Cursor slice.
+func tsdbCursorSlice(a []Cursor) []tsdb.Cursor {
+	var other []tsdb.Cursor
+	for i := range a {
+		other = append(other, &a[i])
+	}
+	return other
+}
+
+// CursorItem represents a key/value pair in a cursor.
+type CursorItem struct {
+	Key   []byte
+	Value []byte
+}
+
+type CursorItems []CursorItem
+
+func (a CursorItems) Len() int           { return len(a) }
+func (a CursorItems) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
+func (a CursorItems) Less(i, j int) bool { return bytes.Compare(a[i].Key, a[j].Key) == -1 }
+
+// byteSlices represents a sortable slice of byte slices.
+type byteSlices [][]byte
+
+func (a byteSlices) Len() int           { return len(a) }
+func (a byteSlices) Less(i, j int) bool { return bytes.Compare(a[i], a[j]) == -1 }
+func (a byteSlices) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
+
+// u64tob converts a uint64 into an 8-byte slice.
+func u64tob(v uint64) []byte {
+	b := make([]byte, 8)
+	binary.BigEndian.PutUint64(b, v)
+	return b
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine.go
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine/b1/b1.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine/b1/b1.go
@ -0,0 +1,695 @@
+package b1
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"hash/fnv"
+	"io"
+	"log"
+	"os"
+	"sort"
+	"sync"
+	"time"
+
+	"github.com/boltdb/bolt"
+	"github.com/influxdb/influxdb/tsdb"
+)
+
+// Format is the file format name of this engine.
+// It is the name the engine is registered under and the value written to the
+// "format" key of the "meta" bucket when a data file is first initialized.
+const Format = "b1"
+
+// init registers the engine constructor with tsdb under Format.
+func init() {
+	tsdb.RegisterEngine(Format, NewEngine)
+}
+
+// topLevelBucketN is the number of non-series buckets in the bolt db.
+// Open creates four of them at the top level: "series", "fields", "wal" and
+// "meta". SeriesCount subtracts this value from the total bucket count.
+const topLevelBucketN = 4
+
+var (
+	// ErrWALPartitionNotFound is returned when flushing a partition whose
+	// bucket does not exist in the WAL.
+	ErrWALPartitionNotFound = errors.New("wal partition not found")
+)
+
+// Ensure Engine implements the interface (compile-time check).
+var _ tsdb.Engine = &Engine{}
+
+// Engine represents a version 1 storage engine.
+//
+// Points are first appended to a partitioned write-ahead log inside the Bolt
+// file and mirrored in an in-memory cache; a flush later moves them into
+// per-series buckets.
+type Engine struct {
+	// mu guards the cache, walSize and the open/close state.
+	mu sync.RWMutex
+
+	path string   // path to data file
+	db   *bolt.DB // underlying database
+
+	// Each cached value is an 8-byte big-endian timestamp followed by the
+	// encoded point data.
+	cache map[uint8]map[string][][]byte // values by <wal partition,series>
+
+	walSize    int           // approximate size of the WAL, in bytes
+	flush      chan struct{} // signals background flush
+	flushTimer *time.Timer   // signals time-based flush
+
+	// These coordinate closing and waiting for running goroutines.
+	wg      sync.WaitGroup
+	closing chan struct{}
+
+	// Used for out-of-band error messages.
+	logger *log.Logger
+
+	// The maximum size and time thresholds for flushing the WAL.
+	MaxWALSize             int
+	WALFlushInterval       time.Duration
+	WALPartitionFlushDelay time.Duration
+
+	// The writer used by the logger.
+	LogOutput io.Writer
+}
+
+// NewEngine returns an unopened Engine for the data file at path, configured
+// with the WAL thresholds from opt and logging to os.Stderr.
+func NewEngine(path string, opt tsdb.EngineOptions) tsdb.Engine {
+	// Every WAL partition gets its own, initially empty, cache.
+	cache := make(map[uint8]map[string][][]byte, WALPartitionN)
+	for id := uint8(0); id < WALPartitionN; id++ {
+		cache[id] = make(map[string][][]byte)
+	}
+
+	return &Engine{
+		path:  path,
+		cache: cache,
+		flush: make(chan struct{}, 1),
+
+		MaxWALSize:             opt.MaxWALSize,
+		WALFlushInterval:       opt.WALFlushInterval,
+		WALPartitionFlushDelay: opt.WALPartitionFlushDelay,
+
+		LogOutput: os.Stderr,
+	}
+}
+
+// Path returns the data file path given to NewEngine.
+func (e *Engine) Path() string {
+	return e.path
+}
+
+// Open opens and initializes the engine.
+//
+// It opens the Bolt file, creates the top-level buckets if they are missing,
+// records the file format, starts the background flusher and finally flushes
+// whatever is left in the on-disk WAL. If initialization fails the engine is
+// closed again before the error is returned.
+func (e *Engine) Open() error {
+	if err := func() error {
+		e.mu.Lock()
+		defer e.mu.Unlock()
+
+		// Open underlying storage.
+		db, err := bolt.Open(e.path, 0666, &bolt.Options{Timeout: 1 * time.Second})
+		if err != nil {
+			return err
+		}
+		e.db = db
+
+		// Initialize data file.
+		if err := e.db.Update(func(tx *bolt.Tx) error {
+			for _, name := range []string{"series", "fields", "wal"} {
+				if _, err := tx.CreateBucketIfNotExists([]byte(name)); err != nil {
+					return fmt.Errorf("create %s bucket: %s", name, err)
+				}
+			}
+
+			// Set file format, if not set yet.
+			b, err := tx.CreateBucketIfNotExists([]byte("meta"))
+			if err != nil {
+				return fmt.Errorf("create meta bucket: %s", err)
+			}
+			if v := b.Get([]byte("format")); v == nil {
+				if err := b.Put([]byte("format"), []byte(Format)); err != nil {
+					return fmt.Errorf("set format: %s", err)
+				}
+			}
+
+			return nil
+		}); err != nil {
+			return fmt.Errorf("init: %s", err)
+		}
+
+		// Start flush interval timer.
+		e.flushTimer = time.NewTimer(e.WALFlushInterval)
+
+		// Initialize logger.
+		e.logger = log.New(e.LogOutput, "[b1] ", log.LstdFlags)
+
+		// Start background goroutines.
+		e.wg.Add(1)
+		e.closing = make(chan struct{})
+		go e.autoflusher(e.closing)
+
+		return nil
+	}(); err != nil {
+		e.close()
+		return err
+	}
+
+	// Flush on-disk WAL before we return to the caller.
+	// This runs outside the lock above because the flush takes e.mu itself.
+	if err := e.Flush(0); err != nil {
+		return fmt.Errorf("flush: %s", err)
+	}
+
+	return nil
+}
+
+// Close closes the underlying database, signals the background goroutines to
+// stop and waits for them to exit.
+func (e *Engine) Close() error {
+	e.mu.Lock()
+	err := e.close()
+	// Release the lock before waiting: a flush running in the background
+	// goroutine acquires e.mu.
+	e.mu.Unlock()
+
+	// Wait for open goroutines to finish.
+	e.wg.Wait()
+	return err
+}
+
+func (e *Engine) close() error {
+	if e.db != nil {
+		e.db.Close()
+	}
+	if e.closing != nil {
+		close(e.closing)
+		e.closing = nil
+	}
+	return nil
+}
+
+// SetLogOutput sets the writer used for log output.
+// The logger is created from it in Open, so this must be called before
+// opening the engine.
+func (e *Engine) SetLogOutput(w io.Writer) {
+	e.LogOutput = w
+}
+
+// LoadMetadataIndex loads the shard metadata into memory.
+//
+// Measurement fields are read from the "fields" bucket and series from the
+// "series" bucket. Both are added to index; decoded fields are additionally
+// stored in measurementFields under the measurement name.
+func (e *Engine) LoadMetadataIndex(index *tsdb.DatabaseIndex, measurementFields map[string]*tsdb.MeasurementFields) error {
+	return e.db.View(func(tx *bolt.Tx) error {
+		// Measurement metadata.
+		fieldsCur := tx.Bucket([]byte("fields")).Cursor()
+		for name, buf := fieldsCur.First(); name != nil; name, buf = fieldsCur.Next() {
+			measurement := index.CreateMeasurementIndexIfNotExists(string(name))
+
+			fields := &tsdb.MeasurementFields{}
+			if err := fields.UnmarshalBinary(buf); err != nil {
+				return err
+			}
+			for fieldName := range fields.Fields {
+				measurement.SetFieldName(fieldName)
+			}
+			fields.Codec = tsdb.NewFieldCodec(fields.Fields)
+			measurementFields[measurement.Name] = fields
+		}
+
+		// Series metadata.
+		seriesCur := tx.Bucket([]byte("series")).Cursor()
+		for key, buf := seriesCur.First(); key != nil; key, buf = seriesCur.Next() {
+			s := &tsdb.Series{}
+			if err := s.UnmarshalBinary(buf); err != nil {
+				return err
+			}
+			index.CreateSeriesIndexIfNotExists(tsdb.MeasurementFromSeriesKey(string(key)), s)
+		}
+		return nil
+	})
+}
+
+// WritePoints will write the raw data points and any new metadata to the index in the shard.
+//
+// New series and measurement fields are stored in their buckets and the points
+// are appended to the WAL partition bucket chosen by WALPartition, all in one
+// Bolt transaction. Once that transaction has committed the points are added
+// to the in-memory cache and an auto flush is triggered if the WAL has grown
+// past MaxWALSize.
+func (e *Engine) WritePoints(points []tsdb.Point, measurementFieldsToSave map[string]*tsdb.MeasurementFields, seriesToCreate []*tsdb.SeriesCreate) error {
+	// save to the underlying bolt instance
+	if err := e.db.Update(func(tx *bolt.Tx) error {
+		// save any new metadata
+		if len(seriesToCreate) > 0 {
+			b := tx.Bucket([]byte("series"))
+			for _, sc := range seriesToCreate {
+				data, err := sc.Series.MarshalBinary()
+				if err != nil {
+					return err
+				}
+				if err := b.Put([]byte(sc.Series.Key), data); err != nil {
+					return err
+				}
+			}
+		}
+		if len(measurementFieldsToSave) > 0 {
+			b := tx.Bucket([]byte("fields"))
+			for name, m := range measurementFieldsToSave {
+				data, err := m.MarshalBinary()
+				if err != nil {
+					return err
+				}
+				if err := b.Put([]byte(name), data); err != nil {
+					return err
+				}
+			}
+		}
+
+		// Write points to WAL bucket.
+		wal := tx.Bucket([]byte("wal"))
+		for _, p := range points {
+			// Retrieve partition bucket.
+			key := p.Key()
+			b, err := wal.CreateBucketIfNotExists([]byte{WALPartition(key)})
+			if err != nil {
+				return fmt.Errorf("create WAL partition bucket: %s", err)
+			}
+
+			// Generate an autoincrementing index for the WAL partition.
+			// A failure here must abort the write; otherwise the entry would
+			// be stored under sequence 0 and overwrite earlier entries.
+			id, err := b.NextSequence()
+			if err != nil {
+				return fmt.Errorf("next wal sequence: %s", err)
+			}
+
+			// Append points sequentially to the WAL bucket.
+			v := marshalWALEntry(key, p.UnixNano(), p.Data())
+			if err := b.Put(u64tob(id), v); err != nil {
+				return fmt.Errorf("put wal: %s", err)
+			}
+		}
+
+		return nil
+	}); err != nil {
+		return err
+	}
+
+	// If successful then save points to in-memory cache.
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	// tracks which in-memory caches need to be resorted
+	resorts := map[uint8]map[string]struct{}{}
+
+	for _, p := range points {
+		// Generate in-memory cache entry of <timestamp,data>.
+		key, data := p.Key(), p.Data()
+		v := make([]byte, 8+len(data))
+		binary.BigEndian.PutUint64(v[0:8], uint64(p.UnixNano()))
+		copy(v[8:], data)
+
+		// Determine if we are appending, i.e. the new entry sorts after the
+		// current last entry of the series.
+		partitionID := WALPartition(key)
+		a := e.cache[partitionID][string(key)]
+		appending := (len(a) == 0 || bytes.Compare(a[len(a)-1], v) == -1)
+
+		// Append to cache list.
+		a = append(a, v)
+
+		// If not appending, keep track of cache lists that need to be resorted.
+		if !appending {
+			series := resorts[partitionID]
+			if series == nil {
+				series = map[string]struct{}{}
+				resorts[partitionID] = series
+			}
+			series[string(key)] = struct{}{}
+		}
+
+		e.cache[partitionID][string(key)] = a
+
+		// Calculate estimated WAL size.
+		e.walSize += len(key) + len(v)
+	}
+
+	// Sort by timestamp if not appending.
+	for partitionID, cache := range resorts {
+		for key := range cache {
+			sort.Sort(byteSlices(e.cache[partitionID][key]))
+		}
+	}
+
+	// Check for flush threshold.
+	e.triggerAutoFlush()
+
+	return nil
+}
+
+// DeleteSeries deletes the series from the engine.
+// For every key the series metadata entry, the series' point bucket and the
+// cached points are removed.
+func (e *Engine) DeleteSeries(keys []string) error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	return e.db.Update(func(tx *bolt.Tx) error {
+		seriesBkt := tx.Bucket([]byte("series"))
+		for _, key := range keys {
+			name := []byte(key)
+			if err := seriesBkt.Delete(name); err != nil {
+				return err
+			}
+			// A series without flushed points has no bucket; that is fine.
+			if err := tx.DeleteBucket(name); err != nil && err != bolt.ErrBucketNotFound {
+				return err
+			}
+			delete(e.cache[WALPartition(name)], key)
+		}
+		return nil
+	})
+}
+
+// DeleteMeasurement deletes a measurement and all related series.
+// The measurement's field metadata is removed first, then each series in
+// seriesKeys is removed from the metadata, the point buckets and the cache.
+func (e *Engine) DeleteMeasurement(name string, seriesKeys []string) error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	return e.db.Update(func(tx *bolt.Tx) error {
+		if err := tx.Bucket([]byte("fields")).Delete([]byte(name)); err != nil {
+			return err
+		}
+
+		seriesBkt := tx.Bucket([]byte("series"))
+		for _, key := range seriesKeys {
+			bucketName := []byte(key)
+			if err := seriesBkt.Delete(bucketName); err != nil {
+				return err
+			}
+			// A series without flushed points has no bucket; that is fine.
+			if err := tx.DeleteBucket(bucketName); err != nil && err != bolt.ErrBucketNotFound {
+				return err
+			}
+			delete(e.cache[WALPartition(bucketName)], key)
+		}
+
+		return nil
+	})
+}
+
+// Flush writes all points from the write ahead log to the index.
+// Partitions are flushed one at a time with a pause of partitionFlushDelay
+// after each; afterwards the WAL size estimate and the flush timer are reset.
+func (e *Engine) Flush(partitionFlushDelay time.Duration) error {
+	// Collect the IDs of the partition buckets currently present in the WAL.
+	var ids []uint8
+	err := e.db.View(func(tx *bolt.Tx) error {
+		wal := tx.Bucket([]byte("wal"))
+		return wal.ForEach(func(k, _ []byte) error {
+			ids = append(ids, uint8(k[0]))
+			return nil
+		})
+	})
+	if err != nil {
+		return err
+	}
+
+	// Flush each partition in turn.
+	for _, id := range ids {
+		if err := e.FlushPartition(id); err != nil {
+			return fmt.Errorf("flush partition: id=%d, err=%s", id, err)
+		}
+
+		// Pause briefly so other goroutines get a chance to run.
+		time.Sleep(partitionFlushDelay)
+	}
+
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	// The WAL is empty again: reset the size estimate and restart the timer.
+	e.walSize = 0
+	e.flushTimer.Reset(e.WALFlushInterval)
+
+	return nil
+}
+
+// FlushPartition flushes a single WAL partition.
+//
+// Every entry in the partition's WAL bucket is written to the top-level bucket
+// named after its series key, keyed by timestamp, and removed from the WAL.
+// On success the in-memory cache for the partition is discarded. The engine
+// lock is held for the whole operation. Returns ErrWALPartitionNotFound if
+// the partition bucket does not exist.
+func (e *Engine) FlushPartition(partitionID uint8) error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	startTime := time.Now()
+
+	var pointN int
+	if err := e.db.Update(func(tx *bolt.Tx) error {
+		// Retrieve partition bucket. Exit if it doesn't exist.
+		pb := tx.Bucket([]byte("wal")).Bucket([]byte{byte(partitionID)})
+		if pb == nil {
+			return ErrWALPartitionNotFound
+		}
+
+		// Iterate over keys in the WAL partition bucket.
+		c := pb.Cursor()
+		for k, v := c.First(); k != nil; k, v = c.Next() {
+			key, timestamp, data := unmarshalWALEntry(v)
+
+			// Create bucket for entry.
+			b, err := tx.CreateBucketIfNotExists(key)
+			if err != nil {
+				return fmt.Errorf("create bucket: %s", err)
+			}
+
+			// Write point to bucket.
+			// A later entry with the same timestamp overwrites an earlier one.
+			if err := b.Put(u64tob(uint64(timestamp)), data); err != nil {
+				return fmt.Errorf("put: %s", err)
+			}
+
+			// Remove entry in the WAL.
+			if err := c.Delete(); err != nil {
+				return fmt.Errorf("delete: %s", err)
+			}
+
+			pointN++
+		}
+
+		return nil
+	}); err != nil {
+		return err
+	}
+
+	// Reset cache.
+	// Only reached after the transaction committed, so the flushed points
+	// are readable from their series buckets.
+	e.cache[partitionID] = make(map[string][][]byte)
+
+	if pointN > 0 {
+		e.logger.Printf("flush %d points in %.3fs", pointN, time.Since(startTime).Seconds())
+	}
+
+	return nil
+}
+
+// autoflusher runs a flush whenever the flush timer fires or a flush is
+// requested on e.flush, until closing is closed.
+// This method runs in a separate goroutine.
+func (e *Engine) autoflusher(closing chan struct{}) {
+	defer e.wg.Done()
+
+	// flush runs a full flush and logs, rather than returns, any failure.
+	flush := func() {
+		if err := e.Flush(e.WALPartitionFlushDelay); err != nil {
+			e.logger.Printf("flush error: %s", err)
+		}
+	}
+
+	for {
+		select {
+		case <-closing:
+			return
+		case <-e.flushTimer.C:
+			flush()
+		case <-e.flush:
+			flush()
+		}
+	}
+}
+
+// triggerAutoFlush requests a background flush once the estimated WAL size
+// has reached MaxWALSize.
+// This function must be called within the context of a lock.
+func (e *Engine) triggerAutoFlush() {
+	if e.walSize >= e.MaxWALSize {
+		// Non-blocking send: if a request is already pending there is
+		// nothing more to do.
+		select {
+		case e.flush <- struct{}{}:
+		default:
+		}
+	}
+}
+
+// SeriesCount returns the number of series buckets on the shard.
+// It counts all top-level buckets and subtracts topLevelBucketN.
+// This does not include a count from the WAL.
+func (e *Engine) SeriesCount() (n int, err error) {
+	countBucket := func(_ []byte, _ *bolt.Bucket) error {
+		n++
+		return nil
+	}
+	err = e.db.View(func(tx *bolt.Tx) error {
+		return tx.ForEach(countBucket)
+	})
+
+	// Discount the buckets that do not hold series data.
+	n -= topLevelBucketN
+
+	return n, err
+}
+
+// Begin starts a new Bolt transaction and wraps it in a Tx bound to this
+// engine. The transaction is writable when writable is true.
+func (e *Engine) Begin(writable bool) (tsdb.Tx, error) {
+	btx, err := e.db.Begin(writable)
+	if err != nil {
+		return nil, err
+	}
+
+	wrapped := &Tx{Tx: btx, engine: e}
+	return wrapped, nil
+}
+
+// DB exposes the Bolt database backing the engine.
+func (e *Engine) DB() *bolt.DB {
+	return e.db
+}
+
+// Tx represents a transaction.
+// It embeds the Bolt transaction and keeps a reference to the engine so that
+// cursors can also read from the engine's in-memory cache.
+type Tx struct {
+	*bolt.Tx
+	engine *Engine
+}
+
+// Cursor returns an iterator for a key.
+// The iterator merges the key's on-disk bucket with a snapshot of its cached
+// points. It returns nil when the key has neither a bucket nor cached points.
+func (tx *Tx) Cursor(key string) tsdb.Cursor {
+	// Look up the on-disk bucket for the key.
+	bucket := tx.Bucket([]byte(key))
+
+	tx.engine.mu.RLock()
+	defer tx.engine.mu.RUnlock()
+
+	cached := tx.engine.cache[WALPartition([]byte(key))][key]
+
+	// Nothing on disk and nothing cached: there is nothing to iterate.
+	if bucket == nil && len(cached) == 0 {
+		return nil
+	}
+
+	// Snapshot the cached list so later writes do not affect the cursor.
+	snapshot := make([][]byte, len(cached))
+	copy(snapshot, cached)
+
+	cur := &Cursor{cache: snapshot}
+	if bucket != nil {
+		cur.cursor = bucket.Cursor()
+	}
+	return cur
+}
+
+// Cursor provides ordered iteration across a series.
+// It merges an optional Bolt cursor with a list of cached entries.
+type Cursor struct {
+	// Bolt cursor and readahead buffer.
+	// cursor is nil when the series has no bucket on disk.
+	cursor *bolt.Cursor
+	buf    struct {
+		key, value []byte
+	}
+
+	// Cache and current cache index.
+	// Each cache entry is an 8-byte key followed by the value.
+	cache [][]byte
+	index int
+
+	// Previously read key.
+	// Used to skip entries that repeat the key that was just returned.
+	prev []byte
+}
+
+// Seek positions both the Bolt cursor and the cache index at the first entry
+// whose key is at or after seek, and returns the lowest key/value pair found.
+func (c *Cursor) Seek(seek []byte) (key, value []byte) {
+	// Position the on-disk cursor, buffering the entry it lands on.
+	if c.cursor != nil {
+		c.buf.key, c.buf.value = c.cursor.Seek(seek)
+	}
+
+	// Position the cache index with a binary search on the 8-byte key prefix.
+	atOrAfter := func(i int) bool {
+		return bytes.Compare(c.cache[i][0:8], seek) >= 0
+	}
+	c.index = sort.Search(len(c.cache), atOrAfter)
+
+	// Forget the previously returned key so the first read is never skipped.
+	c.prev = nil
+	return c.read()
+}
+
+// Next advances the cursor and returns the following key/value pair.
+// The key is nil once both the bucket and the cache are exhausted.
+func (c *Cursor) Next() (key, value []byte) {
+	key, value = c.read()
+	return key, value
+}
+
+// read returns the next key/value in the cursor buffer or cache.
+// Entries are taken from whichever source has the lower key. An entry whose
+// key equals the previously returned key is skipped. A nil key is returned
+// when both sources are exhausted.
+func (c *Cursor) read() (key, value []byte) {
+	// Continue skipping ahead through duplicate keys in the cache list.
+	for {
+		// Read next value from the cursor.
+		// Only refill when the readahead buffer has been consumed.
+		if c.buf.key == nil && c.cursor != nil {
+			c.buf.key, c.buf.value = c.cursor.Next()
+		}
+
+		// Read from the buffer or cache, which ever is lower.
+		// On equal keys the cache entry wins because the buffer is only
+		// chosen when strictly lower; the buffered entry is then dropped as
+		// a duplicate on the next pass.
+		if c.buf.key != nil && (c.index >= len(c.cache) || bytes.Compare(c.buf.key, c.cache[c.index][0:8]) == -1) {
+			key, value = c.buf.key, c.buf.value
+			c.buf.key, c.buf.value = nil, nil
+		} else if c.index < len(c.cache) {
+			key, value = c.cache[c.index][0:8], c.cache[c.index][8:]
+			c.index++
+		} else {
+			key, value = nil, nil
+		}
+
+		// Exit loop if we're at the end of the cache or the next key is different.
+		if key == nil || !bytes.Equal(key, c.prev) {
+			break
+		}
+	}
+
+	c.prev = key
+	return
+}
+
+// WALPartitionN is the number of partitions in the write ahead log.
+const WALPartitionN = 8
+
+// WALPartition maps key to one of the WALPartitionN partitions using the
+// 64-bit FNV-1a hash of the key.
+func WALPartition(key []byte) uint8 {
+	hasher := fnv.New64a()
+	hasher.Write(key)
+
+	sum := hasher.Sum64()
+	return uint8(sum % WALPartitionN)
+}
+
+// marshalWALEntry encodes point data into a single byte slice.
+//
+// The format of the byte slice is:
+//
+//     uint64 timestamp
+//     uint32 key length
+//     []byte key
+//     []byte data
+//
+func marshalWALEntry(key []byte, timestamp int64, data []byte) []byte {
+	v := make([]byte, 8+4, 8+4+len(key)+len(data))
+	binary.BigEndian.PutUint64(v[0:8], uint64(timestamp))
+	binary.BigEndian.PutUint32(v[8:12], uint32(len(key)))
+	v = append(v, key...)
+	v = append(v, data...)
+	return v
+}
+
+// unmarshalWALEntry decodes a WAL entry into its separate parts.
+// Returned byte slices point to the original slice.
+func unmarshalWALEntry(v []byte) (key []byte, timestamp int64, data []byte) {
+	// The key starts after the 12-byte header and runs for the encoded length.
+	keyEnd := 12 + binary.BigEndian.Uint32(v[8:12])
+
+	key = v[12:keyEnd]
+	timestamp = int64(binary.BigEndian.Uint64(v[0:8]))
+	data = v[keyEnd:]
+	return key, timestamp, data
+}
+
+// marshalCacheEntry packs a timestamp and its data into one byte slice.
+//
+// The format of the byte slice is:
+//
+//     uint64 timestamp
+//     []byte data
+//
+// The timestamp is big-endian.
+func marshalCacheEntry(timestamp int64, data []byte) []byte {
+	entry := make([]byte, 8+len(data))
+	binary.BigEndian.PutUint64(entry[0:8], uint64(timestamp))
+	copy(entry[8:], data)
+	return entry
+}
+
+// unmarshalCacheEntry splits an encoded cache entry into its big-endian
+// timestamp (first 8 bytes) and the remaining data.
+// The returned data points into buf.
+func unmarshalCacheEntry(buf []byte) (timestamp int64, data []byte) {
+	ts := binary.BigEndian.Uint64(buf[0:8])
+	return int64(ts), buf[8:]
+}
+
+// u64tob converts a uint64 into an 8-byte slice.
+func u64tob(v uint64) []byte {
+	b := make([]byte, 8)
+	binary.BigEndian.PutUint64(b, v)
+	return b
+}
+
+// byteSlices is a list of byte slices that sorts in bytewise order
+// (sort.Interface).
+type byteSlices [][]byte
+
+// Len returns the number of slices.
+func (a byteSlices) Len() int { return len(a) }
+
+// Less reports whether the slice at i sorts strictly before the slice at j.
+func (a byteSlices) Less(i, j int) bool { return bytes.Compare(a[i], a[j]) < 0 }
+
+// Swap exchanges the slices at positions i and j.
+func (a byteSlices) Swap(i, j int) { a[j], a[i] = a[i], a[j] }
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine/b1/b1_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine/b1/b1_test.go
@ -0,0 +1,134 @@
+package b1_test
+
+import (
+	"bytes"
+	"encoding/binary"
+	"io/ioutil"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/influxdb/influxdb/influxql"
+	"github.com/influxdb/influxdb/tsdb"
+	"github.com/influxdb/influxdb/tsdb/engine/b1"
+)
+
+// Ensure points can be written to the engine and queried.
+// A point is written and flushed to disk, then a second point with the same
+// timestamp is written (remaining in the WAL/cache); the cursor must return
+// only the second value.
+func TestEngine_WritePoints(t *testing.T) {
+	e := OpenDefaultEngine()
+	defer e.Close()
+
+	// Create metadata.
+	mf := &tsdb.MeasurementFields{Fields: make(map[string]*tsdb.Field)}
+	mf.CreateFieldIfNotExists("value", influxql.Float)
+	seriesToCreate := []*tsdb.SeriesCreate{
+		{Series: &tsdb.Series{Key: string(tsdb.MakeKey([]byte("temperature"), nil))}},
+	}
+
+	// Parse point.
+	points, err := tsdb.ParsePointsWithPrecision([]byte("temperature value=100 1434059627"), time.Now().UTC(), "s")
+	if err != nil {
+		t.Fatal(err)
+	} else if data, err := mf.Codec.EncodeFields(points[0].Fields()); err != nil {
+		t.Fatal(err)
+	} else {
+		points[0].SetData(data)
+	}
+
+	// Write original value.
+	if err := e.WritePoints(points, map[string]*tsdb.MeasurementFields{"temperature": mf}, seriesToCreate); err != nil {
+		t.Fatal(err)
+	}
+
+	// Flush to disk.
+	if err := e.Flush(0); err != nil {
+		t.Fatal(err)
+	}
+
+	// Parse new point.
+	// Same series and timestamp as the first point, different value.
+	points, err = tsdb.ParsePointsWithPrecision([]byte("temperature value=200 1434059627"), time.Now().UTC(), "s")
+	if err != nil {
+		t.Fatal(err)
+	} else if data, err := mf.Codec.EncodeFields(points[0].Fields()); err != nil {
+		t.Fatal(err)
+	} else {
+		points[0].SetData(data)
+	}
+
+	// Update existing value.
+	if err := e.WritePoints(points, nil, nil); err != nil {
+		t.Fatal(err)
+	}
+
+	// Ensure only the updated value is read.
+	tx := e.MustBegin(false)
+	defer tx.Rollback()
+
+	c := tx.Cursor("temperature")
+	if k, v := c.Seek([]byte{0}); !bytes.Equal(k, u64tob(uint64(time.Unix(1434059627, 0).UnixNano()))) {
+		t.Fatalf("unexpected key: %#v", k)
+	} else if m, err := mf.Codec.DecodeFieldsWithNames(v); err != nil {
+		t.Fatal(err)
+	} else if m["value"] != float64(200) {
+		t.Errorf("unexpected value: %#v", m)
+	}
+
+	// The flushed value with the same timestamp must not show up again.
+	if k, v := c.Next(); k != nil {
+		t.Fatalf("unexpected key/value: %#v / %#v", k, v)
+	}
+}
+
+// Engine represents a test wrapper for b1.Engine.
+// It embeds the real engine and adds helpers that panic on error and clean up
+// the data file on Close.
+type Engine struct {
+	*b1.Engine
+}
+
+// NewEngine returns a new, unopened instance of Engine backed by a fresh
+// temporary path. Panic on error.
+func NewEngine(opt tsdb.EngineOptions) *Engine {
+	// Generate temporary file.
+	f, err := ioutil.TempFile("", "b1-")
+	if err != nil {
+		panic(err)
+	}
+
+	// Only the unique name is needed; the file itself is removed so the
+	// engine creates its data file from scratch.
+	f.Close()
+	os.Remove(f.Name())
+
+	return &Engine{
+		Engine: b1.NewEngine(f.Name(), opt).(*b1.Engine),
+	}
+}
+
+// OpenEngine creates an Engine with the given options and opens it.
+// Panic on error.
+func OpenEngine(opt tsdb.EngineOptions) *Engine {
+	engine := NewEngine(opt)
+
+	err := engine.Open()
+	if err != nil {
+		panic(err)
+	}
+	return engine
+}
+
+// OpenDefaultEngine returns an open Engine configured with
+// tsdb.NewEngineOptions().
+func OpenDefaultEngine() *Engine {
+	return OpenEngine(tsdb.NewEngineOptions())
+}
+
+// Close closes the engine and removes all data.
+// The data is removed even when closing the engine fails. The first error
+// encountered, if any, is returned.
+func (e *Engine) Close() error {
+	err := e.Engine.Close()
+	if rmErr := os.RemoveAll(e.Path()); err == nil {
+		err = rmErr
+	}
+	return err
+}
+
+// MustBegin returns a new transaction. Panic on error.
+func (e *Engine) MustBegin(writable bool) tsdb.Tx {
+	if tx, err := e.Begin(writable); err != nil {
+		panic(err)
+	} else {
+		return tx
+	}
+}
+
+func u64tob(v uint64) []byte {
+	b := make([]byte, 8)
+	binary.BigEndian.PutUint64(b, v)
+	return b
+}
+
+// btou64 decodes the first 8 bytes of b as a big-endian uint64.
+func btou64(b []byte) uint64 {
+	v := binary.BigEndian.Uint64(b)
+	return v
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine/bz1/bz1.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine/bz1/bz1.go
@ -0,0 +1,627 @@
+package bz1
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"log"
+	"math"
+	"sort"
+	"sync"
+	"time"
+
+	"github.com/boltdb/bolt"
+	"github.com/golang/snappy"
+	"github.com/influxdb/influxdb/tsdb"
+)
+
+var (
+	// ErrSeriesExists is returned when writing points to an existing series.
+	ErrSeriesExists = errors.New("series exists")
+)
+
+// Format is the file format name of this engine.
+// It is the name the engine is registered under and the value written to the
+// "format" key of the "meta" bucket when a data file is first initialized.
+const Format = "bz1"
+
+// init registers the engine constructor with tsdb under Format.
+func init() {
+	tsdb.RegisterEngine(Format, NewEngine)
+}
+
+const (
+	// DefaultBlockSize is the default size of uncompressed points blocks.
+	DefaultBlockSize = 32 * 1024 // 32KB
+)
+
+// Ensure Engine implements the interface (compile-time check).
+var _ tsdb.Engine = &Engine{}
+
+// Engine represents a storage engine with compressed blocks.
+type Engine struct {
+	mu   sync.Mutex // guards opening and closing of db
+	path string     // path to data file
+	db   *bolt.DB   // underlying database
+
+	// Write-ahead log storage.
+	// WritePoints hands points to this writer; it must be set before
+	// WritePoints is called.
+	PointsWriter interface {
+		WritePoints(points []tsdb.Point) error
+	}
+
+	// Size of uncompressed points to write to a block.
+	BlockSize int
+}
+
+// NewEngine returns an unopened Engine for the data file at path, using
+// DefaultBlockSize as its block size. opt is not consulted.
+func NewEngine(path string, opt tsdb.EngineOptions) tsdb.Engine {
+	e := &Engine{path: path}
+	e.BlockSize = DefaultBlockSize
+	return e
+}
+
+// Path returns the data file path given to NewEngine.
+func (e *Engine) Path() string {
+	return e.path
+}
+
+// Open opens and initializes the engine.
+//
+// It opens the Bolt file, creates the top-level buckets if they are missing
+// and records the file format. If initialization fails the database is closed
+// again before the error is returned.
+func (e *Engine) Open() error {
+	if err := func() error {
+		e.mu.Lock()
+		defer e.mu.Unlock()
+
+		// Open underlying storage.
+		db, err := bolt.Open(e.path, 0666, &bolt.Options{Timeout: 1 * time.Second})
+		if err != nil {
+			return err
+		}
+		e.db = db
+
+		// Initialize data file.
+		if err := e.db.Update(func(tx *bolt.Tx) error {
+			for _, name := range []string{"series", "fields", "points"} {
+				if _, err := tx.CreateBucketIfNotExists([]byte(name)); err != nil {
+					return fmt.Errorf("create %s bucket: %s", name, err)
+				}
+			}
+
+			// Set file format, if not set yet.
+			b, err := tx.CreateBucketIfNotExists([]byte("meta"))
+			if err != nil {
+				return fmt.Errorf("create meta bucket: %s", err)
+			}
+			if v := b.Get([]byte("format")); v == nil {
+				if err := b.Put([]byte("format"), []byte(Format)); err != nil {
+					return fmt.Errorf("set format: %s", err)
+				}
+			}
+
+			return nil
+		}); err != nil {
+			return fmt.Errorf("init: %s", err)
+		}
+
+		return nil
+	}(); err != nil {
+		e.close()
+		return err
+	}
+	return nil
+}
+
+// Close closes the engine's database while holding the engine lock.
+func (e *Engine) Close() error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	return e.close()
+}
+
+// close closes the underlying database, if one has been opened.
+func (e *Engine) close() error {
+	if e.db == nil {
+		return nil
+	}
+	return e.db.Close()
+}
+
+// SetLogOutput is a no-op. The writer w is ignored.
+func (e *Engine) SetLogOutput(w io.Writer) {}
+
+// LoadMetadataIndex loads the shard metadata into memory.
+//
+// Measurement fields are read from the "fields" bucket and series from the
+// "series" bucket. Both are added to index; decoded fields are additionally
+// stored in measurementFields under the measurement name.
+func (e *Engine) LoadMetadataIndex(index *tsdb.DatabaseIndex, measurementFields map[string]*tsdb.MeasurementFields) error {
+	return e.db.View(func(tx *bolt.Tx) error {
+		// Measurement metadata.
+		fieldsCur := tx.Bucket([]byte("fields")).Cursor()
+		for name, buf := fieldsCur.First(); name != nil; name, buf = fieldsCur.Next() {
+			measurement := index.CreateMeasurementIndexIfNotExists(string(name))
+
+			fields := &tsdb.MeasurementFields{}
+			if err := fields.UnmarshalBinary(buf); err != nil {
+				return err
+			}
+			for fieldName := range fields.Fields {
+				measurement.SetFieldName(fieldName)
+			}
+			fields.Codec = tsdb.NewFieldCodec(fields.Fields)
+			measurementFields[measurement.Name] = fields
+		}
+
+		// Series metadata.
+		seriesCur := tx.Bucket([]byte("series")).Cursor()
+		for key, buf := seriesCur.First(); key != nil; key, buf = seriesCur.Next() {
+			s := &tsdb.Series{}
+			if err := s.UnmarshalBinary(buf); err != nil {
+				return err
+			}
+			index.CreateSeriesIndexIfNotExists(tsdb.MeasurementFromSeriesKey(string(key)), s)
+		}
+		return nil
+	})
+}
+
+// WritePoints writes metadata and point data into the engine.
+// Series and field metadata are stored in the Bolt file in one transaction;
+// the points themselves are then handed to PointsWriter.
+func (e *Engine) WritePoints(points []tsdb.Point, measurementFieldsToSave map[string]*tsdb.MeasurementFields, seriesToCreate []*tsdb.SeriesCreate) error {
+	// Persist series & field metadata first.
+	saveMetadata := func(tx *bolt.Tx) error {
+		if err := e.writeSeries(tx, seriesToCreate); err != nil {
+			return fmt.Errorf("write series: %s", err)
+		}
+		if err := e.writeFields(tx, measurementFieldsToSave); err != nil {
+			return fmt.Errorf("write fields: %s", err)
+		}
+		return nil
+	}
+	if err := e.db.Update(saveMetadata); err != nil {
+		return err
+	}
+
+	// Then pass the points on to the WAL.
+	err := e.PointsWriter.WritePoints(points)
+	if err != nil {
+		return fmt.Errorf("write points: %s", err)
+	}
+	return nil
+}
+
+// writeSeries stores each series in a, marshaled, in the "series" bucket
+// under its series key. It does nothing when a is empty.
+func (e *Engine) writeSeries(tx *bolt.Tx, a []*tsdb.SeriesCreate) error {
+	if len(a) == 0 {
+		return nil
+	}
+
+	seriesBkt := tx.Bucket([]byte("series"))
+	for i := range a {
+		series := a[i].Series
+
+		buf, err := series.MarshalBinary()
+		if err != nil {
+			return fmt.Errorf("marshal series: %s", err)
+		}
+
+		err = seriesBkt.Put([]byte(series.Key), buf)
+		if err != nil {
+			return fmt.Errorf("put: %s", err)
+		}
+	}
+
+	return nil
+}
+
+// writeFields stores each entry of m, marshaled, in the "fields" bucket under
+// its map key. It does nothing when m is empty.
+func (e *Engine) writeFields(tx *bolt.Tx, m map[string]*tsdb.MeasurementFields) error {
+	if len(m) == 0 {
+		return nil
+	}
+
+	fieldsBkt := tx.Bucket([]byte("fields"))
+	for name, fields := range m {
+		buf, err := fields.MarshalBinary()
+		if err != nil {
+			return fmt.Errorf("marshal measurement field: %s", err)
+		}
+
+		err = fieldsBkt.Put([]byte(name), buf)
+		if err != nil {
+			return fmt.Errorf("put: %s", err)
+		}
+	}
+
+	return nil
+}
+
+// WriteIndex writes marshaled points, grouped by series key, to the engine's
+// underlying index inside a single Bolt transaction.
+func (e *Engine) WriteIndex(pointsByKey map[string][][]byte) error {
+	writeAll := func(tx *bolt.Tx) error {
+		for key, values := range pointsByKey {
+			err := e.writeIndex(tx, key, values)
+			if err != nil {
+				return fmt.Errorf("write: key=%x, err=%s", key, err)
+			}
+		}
+		return nil
+	}
+	return e.db.Update(writeAll)
+}
+
+// writeIndex writes a set of points for a single key.
+//
+// Each entry in a starts with an 8-byte big-endian timestamp. If all new
+// points are later than the last block on disk they are appended as new
+// blocks. Otherwise every block overlapping the new time range is unpacked,
+// merged with the new points (new points replace existing ones with the same
+// timestamp) and rewritten.
+func (e *Engine) writeIndex(tx *bolt.Tx, key string, a [][]byte) error {
+	// Ignore if there are no points.
+	if len(a) == 0 {
+		return nil
+	}
+
+	// Create or retrieve series bucket.
+	bkt, err := tx.Bucket([]byte("points")).CreateBucketIfNotExists([]byte(key))
+	if err != nil {
+		return fmt.Errorf("create series bucket: %s", err)
+	}
+	c := bkt.Cursor()
+
+	// Ensure the slice is sorted before retrieving the time range.
+	a = DedupeEntries(a)
+	sort.Sort(byteSlices(a))
+
+	// Determine time range of new data.
+	tmin, tmax := int64(btou64(a[0][0:8])), int64(btou64(a[len(a)-1][0:8]))
+
+	// If tmin is after the last block then append new blocks.
+	//
+	// This is the optimized fast path. Otherwise we need to merge the points
+	// with existing blocks on disk and rewrite all the blocks for that range.
+	if k, v := c.Last(); k == nil || int64(btou64(v[0:8])) < tmin {
+		if err := e.writeBlocks(bkt, a); err != nil {
+			return fmt.Errorf("append blocks: %s", err)
+		}
+		// Nothing on disk overlaps the new data, so we are done. Falling
+		// through would decode, delete and rewrite the blocks just written.
+		return nil
+	}
+
+	// Generate map of inserted keys.
+	m := make(map[int64]struct{}, len(a))
+	for _, b := range a {
+		m[int64(btou64(b[0:8]))] = struct{}{}
+	}
+
+	// If time range overlaps existing blocks then unpack full range and reinsert.
+	var existing [][]byte
+	for k, v := c.First(); k != nil; k, v = c.Next() {
+		// Determine block range.
+		// The block key holds its minimum timestamp and the first 8 bytes of
+		// the value hold its maximum timestamp.
+		bmin, bmax := int64(btou64(k)), int64(btou64(v[0:8]))
+
+		// Skip over all blocks before the time range.
+		// Exit once we reach a block that is beyond our time range.
+		if bmax < tmin {
+			continue
+		} else if bmin > tmax {
+			break
+		}
+
+		// Decode block.
+		buf, err := snappy.Decode(nil, v[8:])
+		if err != nil {
+			return fmt.Errorf("decode block: %s", err)
+		}
+
+		// Copy out any entries that aren't being overwritten.
+		for _, entry := range SplitEntries(buf) {
+			if _, ok := m[int64(btou64(entry[0:8]))]; !ok {
+				existing = append(existing, entry)
+			}
+		}
+
+		// Delete block in database.
+		if err := c.Delete(); err != nil {
+			return fmt.Errorf("delete block: %s", err)
+		}
+	}
+
+	// Merge entries before rewriting.
+	a = append(existing, a...)
+	sort.Sort(byteSlices(a))
+
+	// Rewrite points to new blocks.
+	if err := e.writeBlocks(bkt, a); err != nil {
+		return fmt.Errorf("rewrite blocks: %s", err)
+	}
+
+	return nil
+}
+
+// writeBlocks writes point data to the bucket in blocks.
+//
+// Entries in a are concatenated until the accumulated size reaches
+// e.BlockSize (or the input ends). Each block is stored under the lowest
+// timestamp it contains; its value is the highest timestamp followed by the
+// snappy-compressed entries.
+func (e *Engine) writeBlocks(bkt *bolt.Bucket, a [][]byte) error {
+	var block []byte
+
+	// Dedupe points by key.
+	a = DedupeEntries(a)
+
+	// Group points into blocks by size.
+	// tmin/tmax start at the extremes so the first point always replaces them.
+	tmin, tmax := int64(math.MaxInt64), int64(math.MinInt64)
+	for i, p := range a {
+		// Update block time range.
+		timestamp := int64(btou64(p[0:8]))
+		if timestamp < tmin {
+			tmin = timestamp
+		}
+		if timestamp > tmax {
+			tmax = timestamp
+		}
+
+		// Append point to the end of the block.
+		block = append(block, p...)
+
+		// If the block is larger than the target block size or this is the
+		// last point then flush the block to the bucket.
+		if len(block) >= e.BlockSize || i == len(a)-1 {
+			// Encode block in the following format:
+			//   tmax int64
+			//   data []byte (snappy compressed)
+			value := append(u64tob(uint64(tmax)), snappy.Encode(nil, block)...)
+
+			// Write block to the bucket.
+			if err := bkt.Put(u64tob(uint64(tmin)), value); err != nil {
+				return fmt.Errorf("put: ts=%d-%d, err=%s", tmin, tmax, err)
+			}
+
+			// Reset the block & time range.
+			block = nil
+			tmin, tmax = int64(math.MaxInt64), int64(math.MinInt64)
+		}
+	}
+
+	return nil
+}
+
+// DeleteSeries deletes the series from the engine.
+// For every key both the series metadata entry and the series' bucket of
+// points are removed.
+func (e *Engine) DeleteSeries(keys []string) error {
+	return e.db.Update(func(tx *bolt.Tx) error {
+		for _, key := range keys {
+			name := []byte(key)
+
+			err := tx.Bucket([]byte("series")).Delete(name)
+			if err != nil {
+				return fmt.Errorf("delete series metadata: %s", err)
+			}
+
+			// A series without stored points has no bucket; that is fine.
+			err = tx.Bucket([]byte("points")).DeleteBucket(name)
+			if err != nil && err != bolt.ErrBucketNotFound {
+				return fmt.Errorf("delete series data: %s", err)
+			}
+		}
+		return nil
+	})
+}
+
+// DeleteMeasurement removes the measurement's entry from the "fields" bucket
+// and then, for every key in seriesKeys, the entry in the "series" bucket and
+// the nested bucket in "points". Everything happens in one update
+// transaction; a missing points bucket is not treated as an error.
+func (e *Engine) DeleteMeasurement(name string, seriesKeys []string) error {
+	return e.db.Update(func(tx *bolt.Tx) error {
+		err := tx.Bucket([]byte("fields")).Delete([]byte(name))
+		if err != nil {
+			return err
+		}
+
+		for i := range seriesKeys {
+			key := []byte(seriesKeys[i])
+
+			if err := tx.Bucket([]byte("series")).Delete(key); err != nil {
+				return fmt.Errorf("delete series metadata: %s", err)
+			}
+
+			err := tx.Bucket([]byte("points")).DeleteBucket(key)
+			if err == nil || err == bolt.ErrBucketNotFound {
+				continue
+			}
+			return fmt.Errorf("delete series data: %s", err)
+		}
+
+		return nil
+	})
+}
+
+// SeriesCount counts the keys in the "points" bucket, i.e. the number of
+// series buckets on the shard, using a read-only transaction.
+func (e *Engine) SeriesCount() (n int, err error) {
+	err = e.db.View(func(tx *bolt.Tx) error {
+		cur := tx.Bucket([]byte("points")).Cursor()
+		k, _ := cur.First()
+		for k != nil {
+			n++
+			k, _ = cur.Next()
+		}
+		return nil
+	})
+	return n, err
+}
+
+// Begin opens a transaction on the underlying database and wraps it in a Tx
+// bound to this engine. The writable flag is passed straight through.
+func (e *Engine) Begin(writable bool) (tsdb.Tx, error) {
+	btx, err := e.db.Begin(writable)
+	if err == nil {
+		return &Tx{Tx: btx, engine: e}, nil
+	}
+	return nil, err
+}
+
+// Stats gathers internal statistics for the engine. The size is read from
+// inside a read-only transaction.
+func (e *Engine) Stats() (stats Stats, err error) {
+	readSize := func(tx *bolt.Tx) error {
+		stats.Size = tx.Size()
+		return nil
+	}
+	err = e.db.View(readSize)
+	return stats, err
+}
+
+// Stats represents internal engine statistics, as filled in by Engine.Stats.
+type Stats struct {
+	Size int64 // BoltDB data size, taken from the transaction's Size()
+}
+
+// Tx represents a transaction: a bolt transaction paired with the engine that began it.
+type Tx struct {
+	*bolt.Tx         // embedded bolt transaction
+	engine   *Engine // engine that created this transaction (set in Engine.Begin)
+}
+
+// Cursor returns an iterator over the series stored under key, or nil when
+// the "points" bucket has no nested bucket for that key.
+func (tx *Tx) Cursor(key string) tsdb.Cursor {
+	series := tx.Bucket([]byte("points")).Bucket([]byte(key))
+	if series != nil {
+		return &Cursor{
+			cursor: series.Cursor(),
+			buf:    make([]byte, DefaultBlockSize),
+		}
+	}
+
+	// Nothing has been stored for this key.
+	return nil
+}
+
+// Cursor provides ordered iteration across a series, decoding one stored block at a time.
+type Cursor struct {
+	cursor *bolt.Cursor // iterates over the series' stored blocks
+	buf    []byte       // uncompressed buffer holding the current block's entries
+	off    int          // buffer offset of the current entry within buf
+}
+
+// Seek moves the cursor to a position and returns the closest key/value pair,
+// i.e. the first entry whose timestamp is on or after seek. It returns nil
+// values when no such entry exists.
+//
+// Blocks are keyed by their minimum timestamp and their value starts with
+// their maximum timestamp. The underlying cursor lands on the first block
+// whose key is >= seek, so when seek falls inside a block (after its first
+// entry) the block that actually contains it is the one *before* the landing
+// position. That block is checked explicitly so its entries are not skipped.
+func (c *Cursor) Seek(seek []byte) (key, value []byte) {
+	// Move cursor to the first block whose minimum timestamp is >= seek.
+	k, v := c.cursor.Seek(seek)
+	if v == nil {
+		// Every block starts before seek; only the last block can still
+		// hold entries at or after it.
+		_, v = c.cursor.Last()
+	} else if bytes.Compare(seek, k) < 0 {
+		// Seek is before this block's first entry, so the previous block
+		// may cover it. Compare against that block's max timestamp prefix.
+		_, v = c.cursor.Prev()
+		if len(v) < 8 || bytes.Compare(seek, v[0:8]) > 0 {
+			// No previous block, or it ends before seek: go back to the
+			// block found originally.
+			_, v = c.cursor.Seek(seek)
+		}
+	}
+	c.setBuf(v)
+
+	// Read current block up to seek position.
+	c.seekBuf(seek)
+
+	// If the chosen block holds nothing on or after seek, continue with the
+	// following block so the buffer is never left positioned past its end.
+	if len(c.buf) > 0 && c.off >= len(c.buf) {
+		_, v = c.cursor.Next()
+		c.setBuf(v)
+	}
+
+	// Return current entry.
+	return c.read()
+}
+
+// seekBuf advances the offset within the current buffer until it reaches an
+// entry whose timestamp is on or after seek, or the end of the buffer.
+// The named results are never assigned, so it always returns nil, nil.
+func (c *Cursor) seekBuf(seek []byte) (key, value []byte) {
+	// rest is the unread remainder of the buffer, starting at the current entry.
+	for rest := c.buf[c.off:]; len(rest) > 0 && bytes.Compare(rest[0:8], seek) < 0; rest = c.buf[c.off:] {
+		// The current entry is before the seek; skip over it.
+		c.off += entryHeaderSize + entryDataSize(rest)
+	}
+	return
+}
+
+// Next advances past the current entry and returns the following key/value
+// pair, loading the next block from the underlying cursor when the current
+// buffer has been consumed. It returns nil values when the buffer is empty.
+func (c *Cursor) Next() (key, value []byte) {
+	if len(c.buf) > 0 {
+		// Step over the current entry.
+		size := entryDataSize(c.buf[c.off:])
+		c.off += entryHeaderSize + size
+
+		// Buffer exhausted: pull in the next block.
+		if c.off >= len(c.buf) {
+			_, block := c.cursor.Next()
+			c.setBuf(block)
+		}
+
+		return c.read()
+	}
+
+	// No buffer, nothing to iterate.
+	return nil, nil
+}
+
+// setBuf saves a compressed block to the buffer and rewinds the offset.
+//
+// An empty block clears the buffer. A block that fails to decode is logged
+// and also leaves the buffer empty, so the cursor reports no entries for it.
+func (c *Cursor) setBuf(block []byte) {
+	// Clear if the block is empty.
+	if len(block) == 0 {
+		c.buf, c.off = c.buf[0:0], 0
+		return
+	}
+
+	// Otherwise decode block into buffer.
+	// Skip over the first 8 bytes since they are the max timestamp.
+	buf, err := snappy.Decode(nil, block[8:])
+	if err != nil {
+		// Reset explicitly and stop here rather than falling through and
+		// overwriting the buffer with the failed decode's result.
+		c.buf, c.off = c.buf[0:0], 0
+		log.Printf("block decode error: %s", err)
+		return
+	}
+	c.buf, c.off = buf, 0
+}
+
+// read returns the key (8-byte timestamp) and data of the entry at the
+// current offset, or nil values when the offset is at or past the end of
+// the buffer. Returned slices point into the buffer.
+func (c *Cursor) read() (key, value []byte) {
+	if c.off < len(c.buf) {
+		entry := c.buf[c.off:]
+		end := entryHeaderSize + entryDataSize(entry)
+		return entry[0:8], entry[entryHeaderSize:end]
+	}
+	return nil, nil
+}
+
+// MarshalEntry encodes point data into a single byte slice.
+//
+// The format of the byte slice is:
+//
+//     uint64 timestamp
+//     uint32 data length
+//     []byte data
+//
+func MarshalEntry(timestamp int64, data []byte) []byte {
+	v := make([]byte, 8+4, 8+4+len(data))
+	binary.BigEndian.PutUint64(v[0:8], uint64(timestamp))
+	binary.BigEndian.PutUint32(v[8:12], uint32(len(data)))
+	v = append(v, data...)
+	return v
+}
+
+// UnmarshalEntry decodes an entry into it's separate parts.
+// Returns the timestamp, data and the number of bytes read.
+// Returned byte slices point to the original slice.
+func UnmarshalEntry(v []byte) (timestamp int64, data []byte, n int) {
+	timestamp = int64(binary.BigEndian.Uint64(v[0:8]))
+	dataLen := binary.BigEndian.Uint32(v[8:12])
+	data = v[12+dataLen:]
+	return timestamp, data, 12 + int(dataLen)
+}
+
+// SplitEntries cuts a run of back-to-back entries into one slice per entry.
+// Each returned slice (header plus data) points into b; an empty input
+// yields a nil result.
+func SplitEntries(b []byte) [][]byte {
+	var entries [][]byte
+	for len(b) > 0 {
+		// Full size of the entry at the front of the buffer.
+		size := entryHeaderSize + entryDataSize(b)
+
+		entries = append(entries, b[0:size])
+		b = b[size:]
+	}
+	return entries
+}
+
+// DedupeEntries returns the entries of a with unique keys (the first 8
+// bytes), sorted. When several entries share a key, the one appearing last
+// in a is kept.
+func DedupeEntries(a [][]byte) [][]byte {
+	// Index by key; later entries replace earlier ones.
+	byKey := make(map[string][]byte)
+	for i := range a {
+		byKey[string(a[i][0:8])] = a[i]
+	}
+
+	// Flatten the survivors and put them in order.
+	uniq := make([][]byte, 0, len(byKey))
+	for _, entry := range byKey {
+		uniq = append(uniq, entry)
+	}
+	sort.Sort(byteSlices(uniq))
+
+	return uniq
+}
+
+// entryHeaderSize is the number of bytes required for the header: an 8-byte timestamp plus a 4-byte data length.
+const entryHeaderSize = 8 + 4
+
+// entryDataSize reads the big-endian uint32 data length stored in bytes 8-12
+// of an entry and returns it as an int.
+func entryDataSize(v []byte) int {
+	size := binary.BigEndian.Uint32(v[8:12])
+	return int(size)
+}
+
+// u64tob converts a uint64 into an 8-byte slice.
+func u64tob(v uint64) []byte {
+	b := make([]byte, 8)
+	binary.BigEndian.PutUint64(b, v)
+	return b
+}
+
+// btou64 decodes the first 8 bytes of b as a big-endian uint64.
+func btou64(b []byte) uint64 {
+	v := binary.BigEndian.Uint64(b)
+	return v
+}
+
+// byteSlices implements sort.Interface over byte slices, ordering them by
+// bytes.Compare.
+type byteSlices [][]byte
+
+// Len returns the number of slices.
+func (a byteSlices) Len() int {
+	return len(a)
+}
+
+// Swap exchanges the slices at positions i and j.
+func (a byteSlices) Swap(i, j int) {
+	a[j], a[i] = a[i], a[j]
+}
+
+// Less reports whether the slice at i sorts before the slice at j.
+func (a byteSlices) Less(i, j int) bool {
+	return bytes.Compare(a[i], a[j]) < 0
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine/bz1/bz1_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine/bz1/bz1_test.go
@ -0,0 +1,439 @@
+package bz1_test
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"reflect"
+	"sort"
+	"strconv"
+	"testing"
+	"testing/quick"
+	"time"
+
+	"github.com/influxdb/influxdb/tsdb"
+	"github.com/influxdb/influxdb/tsdb/engine/bz1"
+)
+
+// Ensure series metadata written through the engine comes back when the
+// metadata index is loaded.
+func TestEngine_LoadMetadataIndex_Series(t *testing.T) {
+	e := OpenDefaultEngine()
+	defer e.Close()
+
+	// The points writer is not under test here; accept everything.
+	e.PointsWriter.WritePointsFn = func(a []tsdb.Point) error { return nil }
+
+	// Series to be created: two tagged "cpu" series and one with spaces in its key.
+	creates := []*tsdb.SeriesCreate{
+		{Series: &tsdb.Series{Key: string(tsdb.MakeKey([]byte("cpu"), map[string]string{"host": "server0"})), Tags: map[string]string{"host": "server0"}}},
+		{Series: &tsdb.Series{Key: string(tsdb.MakeKey([]byte("cpu"), map[string]string{"host": "server1"})), Tags: map[string]string{"host": "server1"}}},
+		{Series: &tsdb.Series{Key: "series with spaces"}},
+	}
+	if err := e.WritePoints(nil, nil, creates); err != nil {
+		t.Fatal(err)
+	}
+
+	// Reload the metadata into a fresh index.
+	index := tsdb.NewDatabaseIndex()
+	fields := make(map[string]*tsdb.MeasurementFields)
+	if err := e.LoadMetadataIndex(index, fields); err != nil {
+		t.Fatal(err)
+	}
+
+	// Check both "cpu" series.
+	cpu := index.Measurement("cpu")
+	if cpu == nil {
+		t.Fatal("measurement not found")
+	}
+	if s := cpu.SeriesByID(1); s.Key != "cpu,host=server0" || !reflect.DeepEqual(s.Tags, map[string]string{"host": "server0"}) {
+		t.Fatalf("unexpected series: %q / %#v", s.Key, s.Tags)
+	}
+	if s := cpu.SeriesByID(2); s.Key != "cpu,host=server1" || !reflect.DeepEqual(s.Tags, map[string]string{"host": "server1"}) {
+		t.Fatalf("unexpected series: %q / %#v", s.Key, s.Tags)
+	}
+
+	// Check the series whose key contains spaces.
+	spaced := index.Measurement("series with spaces")
+	if spaced == nil {
+		t.Fatal("measurement not found")
+	}
+	if s := spaced.SeriesByID(3); s.Key != "series with spaces" {
+		t.Fatalf("unexpected series: %q", s.Key)
+	}
+}
+
+// Ensure measurement field metadata written through the engine comes back
+// when the metadata index is loaded.
+func TestEngine_LoadMetadataIndex_Fields(t *testing.T) {
+	e := OpenDefaultEngine()
+	defer e.Close()
+
+	// The points writer is not under test here; accept everything.
+	e.PointsWriter.WritePointsFn = func(a []tsdb.Point) error { return nil }
+
+	// Write a single "value" field for the "cpu" measurement.
+	written := map[string]*tsdb.MeasurementFields{
+		"cpu": &tsdb.MeasurementFields{
+			Fields: map[string]*tsdb.Field{
+				"value": &tsdb.Field{ID: 0, Name: "value"},
+			},
+		},
+	}
+	if err := e.WritePoints(nil, written, nil); err != nil {
+		t.Fatal(err)
+	}
+
+	// Reload the field metadata into a fresh map.
+	mfs := make(map[string]*tsdb.MeasurementFields)
+	if err := e.LoadMetadataIndex(tsdb.NewDatabaseIndex(), mfs); err != nil {
+		t.Fatal(err)
+	}
+
+	// The reloaded field must match what was written.
+	mf := mfs["cpu"]
+	if mf == nil {
+		t.Fatal("measurement fields not found")
+	}
+	exp := map[string]*tsdb.Field{"value": &tsdb.Field{ID: 0, Name: "value"}}
+	if !reflect.DeepEqual(mf.Fields, exp) {
+		t.Fatalf("unexpected fields: %#v", mf.Fields)
+	}
+}
+
+// Ensure the engine can write points to storage.
+func TestEngine_WritePoints_PointsWriter(t *testing.T) {
+	e := OpenDefaultEngine()
+	defer e.Close()
+
+	// Points to be inserted.
+	points := []tsdb.Point{
+		tsdb.NewPoint("cpu", tsdb.Tags{}, tsdb.Fields{}, time.Unix(0, 1)),
+		tsdb.NewPoint("cpu", tsdb.Tags{}, tsdb.Fields{}, time.Unix(0, 0)),
+		tsdb.NewPoint("cpu", tsdb.Tags{}, tsdb.Fields{}, time.Unix(1, 0)),
+
+		tsdb.NewPoint("cpu", tsdb.Tags{"host": "serverA"}, tsdb.Fields{}, time.Unix(0, 0)),
+	}
+
+	// Mock points writer to ensure points are passed through.
+	var invoked bool
+	e.PointsWriter.WritePointsFn = func(a []tsdb.Point) error {
+		invoked = true
+		if !reflect.DeepEqual(points, a) {
+			t.Fatalf("unexpected points: %#v", a)
+		}
+		return nil
+	}
+
+	// Write points against two separate series.
+	if err := e.WritePoints(points, nil, nil); err != nil {
+		t.Fatal(err)
+	} else if !invoked {
+		t.Fatal("PointsWriter.WritePoints() not called")
+	}
+}
+
+// Ensure an error from the points writer is surfaced by the engine with a
+// "write points: " prefix.
+func TestEngine_WritePoints_ErrPointsWriter(t *testing.T) {
+	e := OpenDefaultEngine()
+	defer e.Close()
+
+	// Make the points writer fail with a recognizable error.
+	e.PointsWriter.WritePointsFn = func(a []tsdb.Point) error { return errors.New("marker") }
+
+	// The engine must report the wrapped error.
+	err := e.WritePoints(nil, nil, nil)
+	if err != nil && err.Error() == `write points: marker` {
+		return
+	}
+	t.Fatal(err)
+}
+
+// Ensure entries written to the index for separate series can be read back
+// in order through a cursor.
+func TestEngine_WriteIndex_Append(t *testing.T) {
+	e := OpenDefaultEngine()
+	defer e.Close()
+
+	// Two entries for "cpu", one for "mem".
+	entries := map[string][][]byte{
+		"cpu": [][]byte{
+			bz1.MarshalEntry(1, []byte{0x10}),
+			bz1.MarshalEntry(2, []byte{0x20}),
+		},
+		"mem": [][]byte{
+			bz1.MarshalEntry(0, []byte{0x30}),
+		},
+	}
+	if err := e.WriteIndex(entries); err != nil {
+		t.Fatal(err)
+	}
+
+	// Read everything back inside a read-only transaction.
+	tx := e.MustBegin(false)
+	defer tx.Rollback()
+
+	// "cpu": expect timestamps 1 and 2, then the end of the series.
+	c := tx.Cursor("cpu")
+	k, v := c.Seek(u64tob(0))
+	if !reflect.DeepEqual(k, []byte{0, 0, 0, 0, 0, 0, 0, 1}) || !reflect.DeepEqual(v, []byte{0x10}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	}
+	k, v = c.Next()
+	if !reflect.DeepEqual(k, []byte{0, 0, 0, 0, 0, 0, 0, 2}) || !reflect.DeepEqual(v, []byte{0x20}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	}
+	if k, _ = c.Next(); k != nil {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	}
+
+	// "mem": expect timestamp 0, then the end of the series.
+	c = tx.Cursor("mem")
+	k, v = c.Seek(u64tob(0))
+	if !reflect.DeepEqual(k, []byte{0, 0, 0, 0, 0, 0, 0, 0}) || !reflect.DeepEqual(v, []byte{0x30}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	}
+	if k, _ = c.Next(); k != nil {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	}
+}
+
+// Ensure the engine can rewrite blocks that contain the new point range.
+func TestEngine_WriteIndex_Insert(t *testing.T) {
+	e := OpenDefaultEngine()
+	defer e.Close()
+
+	// Write initial points to index.
+	if err := e.WriteIndex(map[string][][]byte{
+		"cpu": [][]byte{
+			bz1.MarshalEntry(10, []byte{0x10}),
+			bz1.MarshalEntry(20, []byte{0x20}),
+			bz1.MarshalEntry(30, []byte{0x30}),
+		},
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	// Write overlapping points to index.
+	if err := e.WriteIndex(map[string][][]byte{
+		"cpu": [][]byte{
+			bz1.MarshalEntry(9, []byte{0x09}),
+			bz1.MarshalEntry(10, []byte{0xFF}),
+			bz1.MarshalEntry(25, []byte{0x25}),
+			bz1.MarshalEntry(31, []byte{0x31}),
+		},
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	// Write overlapping points to index again.
+	if err := e.WriteIndex(map[string][][]byte{
+		"cpu": [][]byte{
+			bz1.MarshalEntry(31, []byte{0xFF}),
+		},
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	// Start transaction.
+	tx := e.MustBegin(false)
+	defer tx.Rollback()
+
+	// Iterate over "cpu" series.
+	c := tx.Cursor("cpu")
+	if k, v := c.Seek(u64tob(0)); btou64(k) != 9 || !bytes.Equal(v, []byte{0x09}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = c.Next(); btou64(k) != 10 || !bytes.Equal(v, []byte{0xFF}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = c.Next(); btou64(k) != 20 || !bytes.Equal(v, []byte{0x20}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = c.Next(); btou64(k) != 25 || !bytes.Equal(v, []byte{0x25}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = c.Next(); btou64(k) != 30 || !bytes.Equal(v, []byte{0x30}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	} else if k, v = c.Next(); btou64(k) != 31 || !bytes.Equal(v, []byte{0xFF}) {
+		t.Fatalf("unexpected key/value: %x / %x", k, v)
+	}
+}
+
+// Ensure writing a nil set of keys to the index succeeds.
+func TestEngine_WriteIndex_NoKeys(t *testing.T) {
+	e := OpenDefaultEngine()
+	defer e.Close()
+
+	err := e.WriteIndex(nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure writing a key that carries no points to the index succeeds.
+func TestEngine_WriteIndex_NoPoints(t *testing.T) {
+	e := OpenDefaultEngine()
+	defer e.Close()
+
+	empty := map[string][][]byte{"cpu": nil}
+	err := e.WriteIndex(empty)
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure that randomly generated sets of points, written to the index with a
+// randomly chosen block size, read back through a cursor as the merged,
+// deduped and sorted union of all sets.
+func TestEngine_WriteIndex_Quick(t *testing.T) {
+	if testing.Short() {
+		t.Skip("short mode")
+	}
+
+	// Surface any failure reported by quick.Check itself instead of dropping it.
+	if err := quick.Check(func(sets []Points, blockSize int) bool {
+		e := OpenDefaultEngine()
+		e.BlockSize = blockSize % 1024 // 1KB max block size
+		defer e.Close()
+
+		// Write points to index in multiple sets.
+		for _, set := range sets {
+			if err := e.WriteIndex(map[string][][]byte(set)); err != nil {
+				t.Fatal(err)
+			}
+		}
+
+		// Merge all points together.
+		points := MergePoints(sets)
+
+		// Retrieve a sorted list of keys so results are deterministic.
+		keys := points.Keys()
+
+		// Start transaction to read index.
+		tx := e.MustBegin(false)
+		defer tx.Rollback()
+
+		// Iterate over results to ensure they are correct.
+		for _, key := range keys {
+			c := tx.Cursor(key)
+
+			// Read list of key/values.
+			var got [][]byte
+			for k, v := c.Seek(u64tob(0)); k != nil; k, v = c.Next() {
+				got = append(got, append(copyBytes(k), v...))
+			}
+
+			// Generate expected values.
+			// We need to remove the data length from the slice.
+			var exp [][]byte
+			for _, b := range points[key] {
+				exp = append(exp, append(copyBytes(b[0:8]), b[12:]...)) // remove data len
+			}
+
+			if !reflect.DeepEqual(got, exp) {
+				t.Fatalf("points: block size=%d, key=%s:\n\ngot=%x\n\nexp=%x\n\n", e.BlockSize, key, got, exp)
+			}
+		}
+
+		return true
+	}, nil); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Engine represents a test wrapper for bz1.Engine that carries a mock points writer.
+type Engine struct {
+	*bz1.Engine
+	PointsWriter EnginePointsWriter // mock installed as the wrapped engine's PointsWriter by NewEngine
+}
+
+// NewEngine returns a new, unopened instance of Engine whose data path is a
+// fresh temporary file name and whose points writer is the wrapper's mock.
+// Panics if the temporary file cannot be created.
+func NewEngine(opt tsdb.EngineOptions) *Engine {
+	// Generate temporary file. Only its unique name is needed, so the file
+	// itself is closed and removed again right away.
+	f, err := ioutil.TempFile("", "bz1-")
+	if err != nil {
+		// Without this check a failure would surface as a nil dereference below.
+		panic(err)
+	}
+	f.Close()
+	os.Remove(f.Name())
+
+	// Create test wrapper and attach mocks.
+	e := &Engine{
+		Engine: bz1.NewEngine(f.Name(), opt).(*bz1.Engine),
+	}
+	e.Engine.PointsWriter = &e.PointsWriter
+	return e
+}
+
+// OpenEngine creates an Engine with the given options and opens it.
+// Panic on error.
+func OpenEngine(opt tsdb.EngineOptions) *Engine {
+	e := NewEngine(opt)
+
+	err := e.Open()
+	if err != nil {
+		panic(err)
+	}
+
+	return e
+}
+
+// OpenDefaultEngine opens an Engine configured with tsdb.NewEngineOptions().
+func OpenDefaultEngine() *Engine {
+	opt := tsdb.NewEngineOptions()
+	return OpenEngine(opt)
+}
+
+// Close closes the wrapped engine and then removes everything at its path.
+// It always returns nil.
+func (e *Engine) Close() error {
+	e.Engine.Close()
+
+	dir := e.Path()
+	os.RemoveAll(dir)
+
+	return nil
+}
+
+// MustBegin starts and returns a new transaction. Panic on error.
+func (e *Engine) MustBegin(writable bool) tsdb.Tx {
+	tx, err := e.Begin(writable)
+	if err == nil {
+		return tx
+	}
+	panic(err)
+}
+
+// EnginePointsWriter is a mock implementation of Engine.PointsWriter whose
+// behavior is supplied by the test through WritePointsFn.
+type EnginePointsWriter struct {
+	WritePointsFn func(points []tsdb.Point) error
+}
+
+// WritePoints forwards the points to WritePointsFn and returns its result.
+func (w *EnginePointsWriter) WritePoints(points []tsdb.Point) error {
+	fn := w.WritePointsFn
+	return fn(points)
+}
+
+// Points represents a set of encoded points (bz1.MarshalEntry output) by series key. Implements quick.Generator.
+type Points map[string][][]byte
+
+// Keys returns the map's keys in ascending order. The result is nil when the
+// map is empty.
+func (m Points) Keys() []string {
+	var names []string
+	for name := range m {
+		names = append(names, name)
+	}
+	sort.Sort(sort.StringSlice(names))
+	return names
+}
+
+// Generate implements quick.Generator. It builds up to size series, keyed by
+// a random number below 20, each receiving up to size entries with a random
+// timestamp below 100ns and random data. Panics if data cannot be generated.
+func (Points) Generate(rand *rand.Rand, size int) reflect.Value {
+	dataType := reflect.TypeOf([]byte(nil))
+	m := make(map[string][][]byte)
+
+	seriesN := rand.Intn(size)
+	for i := 0; i < seriesN; i++ {
+		// Keys may repeat, in which case entries accumulate on the same series.
+		key := strconv.Itoa(rand.Intn(20))
+
+		pointN := rand.Intn(size)
+		for j := 0; j < pointN; j++ {
+			offset := time.Duration(rand.Intn(100))
+			timestamp := time.Unix(0, 0).Add(offset)
+
+			data, ok := quick.Value(dataType, rand)
+			if !ok {
+				panic("cannot generate data")
+			}
+
+			entry := bz1.MarshalEntry(timestamp.UnixNano(), data.Interface().([]byte))
+			m[key] = append(m[key], entry)
+		}
+	}
+
+	return reflect.ValueOf(Points(m))
+}
+
+// MergePoints concatenates every set's entries per key, in the order the
+// sets are given, and then dedupes each key's entries with
+// bz1.DedupeEntries so that later points overwrite earlier ones.
+func MergePoints(a []Points) Points {
+	merged := make(Points)
+
+	// Gather all entries under their keys.
+	for i := range a {
+		for key, entries := range a[i] {
+			merged[key] = append(merged[key], entries...)
+		}
+	}
+
+	// Reduce each key to unique, sorted entries.
+	for key := range merged {
+		merged[key] = bz1.DedupeEntries(merged[key])
+	}
+
+	return merged
+}
+
+// copyBytes returns an independent copy of b. A nil input stays nil; an
+// empty non-nil input yields an empty non-nil slice.
+func copyBytes(b []byte) []byte {
+	if b != nil {
+		dup := make([]byte, len(b))
+		copy(dup, b)
+		return dup
+	}
+	return nil
+}
+
+// u64tob converts a uint64 into an 8-byte slice.
+func u64tob(v uint64) []byte {
+	b := make([]byte, 8)
+	binary.BigEndian.PutUint64(b, v)
+	return b
+}
+
+// btou64 decodes the first 8 bytes of b as a big-endian uint64.
+func btou64(b []byte) uint64 {
+	v := binary.BigEndian.Uint64(b)
+	return v
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine/engine.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine/engine.go
@ -0,0 +1,6 @@
+package engine
+
+import (
+	_ "github.com/influxdb/influxdb/tsdb/engine/b1"
+	_ "github.com/influxdb/influxdb/tsdb/engine/bz1"
+)
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/engine_test.go
@ -1,815 +1,3 @@
-package tsdb
+package tsdb_test

-import (
-	"fmt"
-	"io/ioutil"
-	"math"
-	"os"
-	"testing"
-	"time"
-
-	"github.com/influxdb/influxdb/influxql"
-	"github.com/influxdb/influxdb/meta"
-)
-
-var sID0 = uint64(1)
-var sID1 = uint64(2)
-var sgID1 = uint64(3)
-var sgID2 = uint64(4)
-var nID = uint64(42)
-
-// Simple test to ensure data can be read from two shards.
-func TestWritePointsAndExecuteTwoShards(t *testing.T) {
-	// Create the mock planner and its metastore
-	store, query_executor := testStoreAndQueryExecutor()
-	defer os.RemoveAll(store.path)
-	query_executor.MetaStore = &testQEMetastore{
-		sgFunc: func(database, policy string, min, max time.Time) (a []meta.ShardGroupInfo, err error) {
-			return []meta.ShardGroupInfo{
-				{
-					ID:        sgID,
-					StartTime: time.Now().Add(-time.Hour),
-					EndTime:   time.Now().Add(time.Hour),
-					Shards: []meta.ShardInfo{
-						{
-							ID:       uint64(sID0),
-							OwnerIDs: []uint64{nID},
-						},
-					},
-				},
-				{
-					ID:        sgID,
-					StartTime: time.Now().Add(-2 * time.Hour),
-					EndTime:   time.Now().Add(-time.Hour),
-					Shards: []meta.ShardInfo{
-						{
-							ID:       uint64(sID1),
-							OwnerIDs: []uint64{nID},
-						},
-					},
-				},
-			}, nil
-		},
-	}
-
-	// Write two points across shards.
-	pt1time := time.Unix(1, 0).UTC()
-	if err := store.WriteToShard(sID0, []Point{NewPoint(
-		"cpu",
-		map[string]string{"host": "serverA", "region": "us-east"},
-		map[string]interface{}{"value": 100},
-		pt1time,
-	)}); err != nil {
-		t.Fatalf(err.Error())
-	}
-	pt2time := time.Unix(2, 0).UTC()
-	if err := store.WriteToShard(sID1, []Point{NewPoint(
-		"cpu",
-		map[string]string{"host": "serverB", "region": "us-east"},
-		map[string]interface{}{"value": 200},
-		pt2time,
-	)}); err != nil {
-		t.Fatalf(err.Error())
-	}
-
-	var tests = []struct {
-		skip      bool   // Skip test
-		stmt      string // Query statement
-		chunkSize int    // Chunk size for driving the executor
-		expected  string // Expected results, rendered as a string
-	}{
-		{
-			stmt:     `SELECT value FROM cpu`,
-			expected: `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100],["1970-01-01T00:00:02Z",200]]}]`,
-		},
-		{
-			stmt:      `SELECT value FROM cpu`,
-			chunkSize: 1,
-			expected:  `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]},{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:02Z",200]]}]`,
-		},
-		{
-			stmt:     `SELECT value FROM cpu LIMIT 1`,
-			expected: `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]}]`,
-		},
-		{
-			stmt:      `SELECT value FROM cpu LIMIT 1`,
-			chunkSize: 2,
-			expected:  `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]}]`,
-		},
-		{
-			stmt:     `SELECT value FROM cpu WHERE host='serverA'`,
-			expected: `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]}]`,
-		},
-		{
-			stmt:     `SELECT value FROM cpu WHERE host='serverB'`,
-			expected: `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:02Z",200]]}]`,
-		},
-		{
-			stmt:     `SELECT value FROM cpu WHERE host='serverC'`,
-			expected: `null`,
-		},
-		{
-			stmt:     `SELECT value FROM cpu GROUP BY host`,
-			expected: `[{"name":"cpu","tags":{"host":"serverA"},"columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]},{"name":"cpu","tags":{"host":"serverB"},"columns":["time","value"],"values":[["1970-01-01T00:00:02Z",200]]}]`,
-		},
-		{
-			stmt:     `SELECT value FROM cpu GROUP BY region`,
-			expected: `[{"name":"cpu","tags":{"region":"us-east"},"columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100],["1970-01-01T00:00:02Z",200]]}]`,
-		},
-		{
-			stmt:     `SELECT value FROM cpu GROUP BY host,region`,
-			expected: `[{"name":"cpu","tags":{"host":"serverA","region":"us-east"},"columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]},{"name":"cpu","tags":{"host":"serverB","region":"us-east"},"columns":["time","value"],"values":[["1970-01-01T00:00:02Z",200]]}]`,
-		},
-		{
-			stmt:     `SELECT value FROM cpu WHERE host='serverA' GROUP BY host`,
-			expected: `[{"name":"cpu","tags":{"host":"serverA"},"columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]}]`,
-		},
-
-		// Aggregate queries.
-		{
-			stmt:     `SELECT sum(value) FROM cpu`,
-			expected: `[{"name":"cpu","columns":["time","sum"],"values":[["1970-01-01T00:00:00Z",300]]}]`,
-		},
-	}
-
-	for _, tt := range tests {
-		if tt.skip {
-			t.Logf("Skipping test %s", tt.stmt)
-			continue
-		}
-		executor, err := query_executor.plan(mustParseSelectStatement(tt.stmt), tt.chunkSize)
-		if err != nil {
-			t.Fatalf("failed to plan query: %s", err.Error())
-		}
-		got := executeAndGetResults(executor)
-		if got != tt.expected {
-			t.Fatalf("Test %s\nexp: %s\ngot: %s\n", tt.stmt, tt.expected, got)
-		}
-	}
-}
-
-// Test that executor correctly orders data across shards.
-func TestWritePointsAndExecuteTwoShardsAlign(t *testing.T) {
-	// Create the mock planner and its metastore
-	store, query_executor := testStoreAndQueryExecutor()
-	defer os.RemoveAll(store.path)
-	query_executor.MetaStore = &testQEMetastore{
-		sgFunc: func(database, policy string, min, max time.Time) (a []meta.ShardGroupInfo, err error) {
-			return []meta.ShardGroupInfo{
-				{
-					ID:        sgID,
-					StartTime: time.Now().Add(-2 * time.Hour),
-					EndTime:   time.Now().Add(-time.Hour),
-					Shards: []meta.ShardInfo{
-						{
-							ID:       uint64(sID1),
-							OwnerIDs: []uint64{nID},
-						},
-					},
-				},
-				{
-					ID:        sgID,
-					StartTime: time.Now().Add(-2 * time.Hour),
-					EndTime:   time.Now().Add(time.Hour),
-					Shards: []meta.ShardInfo{
-						{
-							ID:       uint64(sID0),
-							OwnerIDs: []uint64{nID},
-						},
-					},
-				},
-			}, nil
-		},
-	}
-
-	// Write interleaving, by time, chunks to the shards.
-	if err := store.WriteToShard(sID0, []Point{NewPoint(
-		"cpu",
-		map[string]string{"host": "serverA"},
-		map[string]interface{}{"value": 100},
-		time.Unix(1, 0).UTC(),
-	)}); err != nil {
-		t.Fatalf(err.Error())
-	}
-	if err := store.WriteToShard(sID1, []Point{NewPoint(
-		"cpu",
-		map[string]string{"host": "serverB"},
-		map[string]interface{}{"value": 200},
-		time.Unix(2, 0).UTC(),
-	)}); err != nil {
-		t.Fatalf(err.Error())
-	}
-	if err := store.WriteToShard(sID1, []Point{NewPoint(
-		"cpu",
-		map[string]string{"host": "serverA"},
-		map[string]interface{}{"value": 300},
-		time.Unix(3, 0).UTC(),
-	)}); err != nil {
-		t.Fatalf(err.Error())
-	}
-
-	var tests = []struct {
-		skip      bool   // Skip test
-		stmt      string // Query statement
-		chunkSize int    // Chunk size for driving the executor
-		expected  string // Expected results, rendered as a string
-	}{
-		{
-			stmt:      `SELECT value FROM cpu`,
-			chunkSize: 1,
-			expected:  `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]},{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:02Z",200]]},{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:03Z",300]]}]`,
-		},
-		{
-			stmt:      `SELECT value FROM cpu`,
-			chunkSize: 2,
-			expected:  `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100],["1970-01-01T00:00:02Z",200]]},{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:03Z",300]]}]`,
-		},
-		{
-			stmt:      `SELECT mean(value),sum(value) FROM cpu`,
-			chunkSize: 2,
-			expected:  `[{"name":"cpu","columns":["time","mean","sum"],"values":[["1970-01-01T00:00:00Z",200,600]]}]`,
-		},
-	}
-
-	for _, tt := range tests {
-		if tt.skip {
-			t.Logf("Skipping test %s", tt.stmt)
-			continue
-		}
-		executor, err := query_executor.plan(mustParseSelectStatement(tt.stmt), tt.chunkSize)
-		if err != nil {
-			t.Fatalf("failed to plan query: %s", err.Error())
-		}
-		got := executeAndGetResults(executor)
-		if got != tt.expected {
-			t.Fatalf("Test %s\nexp: %s\ngot: %s\n", tt.stmt, tt.expected, got)
-		}
-	}
-}
-
-// Test that executor correctly orders data across shards when the tagsets
-// are not presented in alphabetically order across shards.
-func TestWritePointsAndExecuteTwoShardsTagSetOrdering(t *testing.T) {
-	// Create the mock planner and its metastore
-	store, query_executor := testStoreAndQueryExecutor()
-	defer os.RemoveAll(store.path)
-	query_executor.MetaStore = &testQEMetastore{
-		sgFunc: func(database, policy string, min, max time.Time) (a []meta.ShardGroupInfo, err error) {
-			return []meta.ShardGroupInfo{
-				{
-					ID: sgID,
-					Shards: []meta.ShardInfo{
-						{
-							ID:       uint64(sID0),
-							OwnerIDs: []uint64{nID},
-						},
-					},
-				},
-				{
-					ID: sgID,
-					Shards: []meta.ShardInfo{
-						{
-							ID:       uint64(sID1),
-							OwnerIDs: []uint64{nID},
-						},
-					},
-				},
-			}, nil
-		},
-	}
-
-	// Write tagsets "y" and "z" to first shard.
-	if err := store.WriteToShard(sID0, []Point{NewPoint(
-		"cpu",
-		map[string]string{"host": "y"},
-		map[string]interface{}{"value": 100},
-		time.Unix(1, 0).UTC(),
-	)}); err != nil {
-		t.Fatalf(err.Error())
-	}
-	if err := store.WriteToShard(sID0, []Point{NewPoint(
-		"cpu",
-		map[string]string{"host": "z"},
-		map[string]interface{}{"value": 200},
-		time.Unix(1, 0).UTC(),
-	)}); err != nil {
-		t.Fatalf(err.Error())
-	}
-
-	// Write tagsets "x", y" and "z" to second shard.
-	if err := store.WriteToShard(sID1, []Point{NewPoint(
-		"cpu",
-		map[string]string{"host": "x"},
-		map[string]interface{}{"value": 300},
-		time.Unix(2, 0).UTC(),
-	)}); err != nil {
-		t.Fatalf(err.Error())
-	}
-	if err := store.WriteToShard(sID1, []Point{NewPoint(
-		"cpu",
-		map[string]string{"host": "y"},
-		map[string]interface{}{"value": 400},
-		time.Unix(3, 0).UTC(),
-	)}); err != nil {
-		t.Fatalf(err.Error())
-	}
-	if err := store.WriteToShard(sID1, []Point{NewPoint(
-		"cpu",
-		map[string]string{"host": "z"},
-		map[string]interface{}{"value": 500},
-		time.Unix(3, 0).UTC(),
-	)}); err != nil {
-		t.Fatalf(err.Error())
-	}
-
-	var tests = []struct {
-		skip      bool   // Skip test
-		stmt      string // Query statement
-		chunkSize int    // Chunk size for driving the executor
-		expected  string // Expected results, rendered as a string
-	}{
-		{
-			stmt:     `SELECT sum(value) FROM cpu GROUP BY host`,
-			expected: `[{"name":"cpu","tags":{"host":"x"},"columns":["time","sum"],"values":[["1970-01-01T00:00:00Z",300]]},{"name":"cpu","tags":{"host":"y"},"columns":["time","sum"],"values":[["1970-01-01T00:00:00Z",500]]},{"name":"cpu","tags":{"host":"z"},"columns":["time","sum"],"values":[["1970-01-01T00:00:00Z",700]]}]`,
-		},
-		{
-			stmt:     `SELECT value FROM cpu GROUP BY host`,
-			expected: `[{"name":"cpu","tags":{"host":"x"},"columns":["time","value"],"values":[["1970-01-01T00:00:02Z",300]]},{"name":"cpu","tags":{"host":"y"},"columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100],["1970-01-01T00:00:03Z",400]]},{"name":"cpu","tags":{"host":"z"},"columns":["time","value"],"values":[["1970-01-01T00:00:01Z",200],["1970-01-01T00:00:03Z",500]]}]`,
-		},
-	}
-
-	for _, tt := range tests {
-		if tt.skip {
-			t.Logf("Skipping test %s", tt.stmt)
-			continue
-		}
-		executor, err := query_executor.plan(mustParseSelectStatement(tt.stmt), tt.chunkSize)
-		if err != nil {
-			t.Fatalf("failed to plan query: %s", err.Error())
-		}
-		got := executeAndGetResults(executor)
-		if got != tt.expected {
-			t.Fatalf("Test %s\nexp: %s\ngot: %s\n", tt.stmt, tt.expected, got)
-		}
-	}
-}
-
-// TestProccessAggregateDerivative tests the rawQueryDerivativeProcessor transformation function on the engine.
-// The is called for a query with a GROUP BY.
-func TestProcessAggregateDerivative(t *testing.T) {
-	tests := []struct {
-		name     string
-		fn       string
-		interval time.Duration
-		in       [][]interface{}
-		exp      [][]interface{}
-	}{
-		{
-			name:     "empty input",
-			fn:       "derivative",
-			interval: 24 * time.Hour,
-			in:       [][]interface{}{},
-			exp:      [][]interface{}{},
-		},
-
-		{
-			name:     "single row returns 0.0",
-			fn:       "derivative",
-			interval: 24 * time.Hour,
-			in: [][]interface{}{
-				[]interface{}{
-					time.Unix(0, 0), 1.0,
-				},
-			},
-			exp: [][]interface{}{
-				[]interface{}{
-					time.Unix(0, 0), 0.0,
-				},
-			},
-		},
-		{
-			name:     "basic derivative",
-			fn:       "derivative",
-			interval: 24 * time.Hour,
-			in: [][]interface{}{
-				[]interface{}{
-					time.Unix(0, 0), 1.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(24 * time.Hour), 3.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(48 * time.Hour), 5.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(72 * time.Hour), 9.0,
-				},
-			},
-			exp: [][]interface{}{
-				[]interface{}{
-					time.Unix(0, 0).Add(24 * time.Hour), 2.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(48 * time.Hour), 2.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(72 * time.Hour), 4.0,
-				},
-			},
-		},
-		{
-			name:     "12h interval",
-			fn:       "derivative",
-			interval: 12 * time.Hour,
-			in: [][]interface{}{
-				[]interface{}{
-					time.Unix(0, 0), 1.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(24 * time.Hour), 2.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(48 * time.Hour), 3.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(72 * time.Hour), 4.0,
-				},
-			},
-			exp: [][]interface{}{
-				[]interface{}{
-					time.Unix(0, 0).Add(24 * time.Hour), 0.5,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(48 * time.Hour), 0.5,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(72 * time.Hour), 0.5,
-				},
-			},
-		},
-		{
-			name:     "negative derivatives",
-			fn:       "derivative",
-			interval: 24 * time.Hour,
-			in: [][]interface{}{
-				[]interface{}{
-					time.Unix(0, 0), 1.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(24 * time.Hour), 2.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(48 * time.Hour), 0.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(72 * time.Hour), 4.0,
-				},
-			},
-			exp: [][]interface{}{
-				[]interface{}{
-					time.Unix(0, 0).Add(24 * time.Hour), 1.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(48 * time.Hour), -2.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(72 * time.Hour), 4.0,
-				},
-			},
-		},
-		{
-			name:     "negative derivatives",
-			fn:       "non_negative_derivative",
-			interval: 24 * time.Hour,
-			in: [][]interface{}{
-				[]interface{}{
-					time.Unix(0, 0), 1.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(24 * time.Hour), 2.0,
-				},
-				// Show resultes in negative derivative
-				[]interface{}{
-					time.Unix(0, 0).Add(48 * time.Hour), 0.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(72 * time.Hour), 4.0,
-				},
-			},
-			exp: [][]interface{}{
-				[]interface{}{
-					time.Unix(0, 0).Add(24 * time.Hour), 1.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(72 * time.Hour), 4.0,
-				},
-			},
-		},
-		{
-			name:     "float derivatives",
-			fn:       "derivative",
-			interval: 24 * time.Hour,
-			in: [][]interface{}{
-				[]interface{}{
-					time.Unix(0, 0), 1.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(24 * time.Hour), int64(3),
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(48 * time.Hour), int64(5),
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(72 * time.Hour), int64(9),
-				},
-			},
-			exp: [][]interface{}{
-				[]interface{}{
-					time.Unix(0, 0).Add(24 * time.Hour), 2.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(48 * time.Hour), 2.0,
-				},
-				[]interface{}{
-					time.Unix(0, 0).Add(72 * time.Hour), 4.0,
-				},
-			},
-		},
-	}
-
-	for _, test := range tests {
-		got := processAggregateDerivative(test.in, test.fn == "non_negative_derivative", test.interval)
-
-		if len(got) != len(test.exp) {
-			t.Fatalf("processAggregateDerivative(%s) - %s\nlen mismatch: got %d, exp %d", test.fn, test.name, len(got), len(test.exp))
-		}
-
-		for i := 0; i < len(test.exp); i++ {
-			if test.exp[i][0] != got[i][0] || test.exp[i][1] != got[i][1] {
-				t.Fatalf("processAggregateDerivative - %s results mismatch:\ngot %v\nexp %v", test.name, got, test.exp)
-			}
-		}
-	}
-}
-
-// TestProcessRawQueryDerivative tests the rawQueryDerivativeProcessor transformation function on the engine.
-// The is called for a queries that do not have a group by.
-func TestProcessRawQueryDerivative(t *testing.T) {
-	tests := []struct {
-		name     string
-		fn       string
-		interval time.Duration
-		in       []*mapperValue
-		exp      []*mapperValue
-	}{
-		{
-			name:     "empty input",
-			fn:       "derivative",
-			interval: 24 * time.Hour,
-			in:       []*mapperValue{},
-			exp:      []*mapperValue{},
-		},
-
-		{
-			name:     "single row returns 0.0",
-			fn:       "derivative",
-			interval: 24 * time.Hour,
-			in: []*mapperValue{
-				{
-					Time:  time.Unix(0, 0).Unix(),
-					Value: 1.0,
-				},
-			},
-			exp: []*mapperValue{
-				{
-					Time:  time.Unix(0, 0).Unix(),
-					Value: 0.0,
-				},
-			},
-		},
-		{
-			name:     "basic derivative",
-			fn:       "derivative",
-			interval: 24 * time.Hour,
-			in: []*mapperValue{
-				{
-					Time:  time.Unix(0, 0).Unix(),
-					Value: 0.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(24 * time.Hour).UnixNano(),
-					Value: 3.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(48 * time.Hour).UnixNano(),
-					Value: 5.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(72 * time.Hour).UnixNano(),
-					Value: 9.0,
-				},
-			},
-			exp: []*mapperValue{
-				{
-					Time:  time.Unix(0, 0).Add(24 * time.Hour).UnixNano(),
-					Value: 3.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(48 * time.Hour).UnixNano(),
-					Value: 2.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(72 * time.Hour).UnixNano(),
-					Value: 4.0,
-				},
-			},
-		},
-		{
-			name:     "12h interval",
-			fn:       "derivative",
-			interval: 12 * time.Hour,
-			in: []*mapperValue{
-				{
-					Time:  time.Unix(0, 0).UnixNano(),
-					Value: 1.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(24 * time.Hour).UnixNano(),
-					Value: 2.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(48 * time.Hour).UnixNano(),
-					Value: 3.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(72 * time.Hour).UnixNano(),
-					Value: 4.0,
-				},
-			},
-			exp: []*mapperValue{
-				{
-					Time:  time.Unix(0, 0).Add(24 * time.Hour).UnixNano(),
-					Value: 0.5,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(48 * time.Hour).UnixNano(),
-					Value: 0.5,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(72 * time.Hour).UnixNano(),
-					Value: 0.5,
-				},
-			},
-		},
-		{
-			name:     "negative derivatives",
-			fn:       "derivative",
-			interval: 24 * time.Hour,
-			in: []*mapperValue{
-				{
-					Time:  time.Unix(0, 0).Unix(),
-					Value: 1.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(24 * time.Hour).UnixNano(),
-					Value: 2.0,
-				},
-				// should go negative
-				{
-					Time:  time.Unix(0, 0).Add(48 * time.Hour).UnixNano(),
-					Value: 0.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(72 * time.Hour).UnixNano(),
-					Value: 4.0,
-				},
-			},
-			exp: []*mapperValue{
-				{
-					Time:  time.Unix(0, 0).Add(24 * time.Hour).UnixNano(),
-					Value: 1.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(48 * time.Hour).UnixNano(),
-					Value: -2.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(72 * time.Hour).UnixNano(),
-					Value: 4.0,
-				},
-			},
-		},
-		{
-			name:     "negative derivatives",
-			fn:       "non_negative_derivative",
-			interval: 24 * time.Hour,
-			in: []*mapperValue{
-				{
-					Time:  time.Unix(0, 0).Unix(),
-					Value: 1.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(24 * time.Hour).UnixNano(),
-					Value: 2.0,
-				},
-				// should go negative
-				{
-					Time:  time.Unix(0, 0).Add(48 * time.Hour).UnixNano(),
-					Value: 0.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(72 * time.Hour).UnixNano(),
-					Value: 4.0,
-				},
-			},
-			exp: []*mapperValue{
-				{
-					Time:  time.Unix(0, 0).Add(24 * time.Hour).UnixNano(),
-					Value: 1.0,
-				},
-				{
-					Time:  time.Unix(0, 0).Add(72 * time.Hour).UnixNano(),
-					Value: 4.0,
-				},
-			},
-		},
-	}
-
-	for _, test := range tests {
-		p := rawQueryDerivativeProcessor{
-			isNonNegative:      test.fn == "non_negative_derivative",
-			derivativeInterval: test.interval,
-		}
-		got := p.process(test.in)
-
-		if len(got) != len(test.exp) {
-			t.Fatalf("rawQueryDerivativeProcessor(%s) - %s\nlen mismatch: got %d, exp %d", test.fn, test.name, len(got), len(test.exp))
-		}
-
-		for i := 0; i < len(test.exp); i++ {
-			fmt.Println("Times:", test.exp[i].Time, got[i].Time)
-			if test.exp[i].Time != got[i].Time || math.Abs((test.exp[i].Value.(float64)-got[i].Value.(float64))) > 0.0000001 {
-				t.Fatalf("rawQueryDerivativeProcessor - %s results mismatch:\ngot %v\nexp %v", test.name, got, test.exp)
-			}
-		}
-	}
-}
-
-type testQEMetastore struct {
-	sgFunc func(database, policy string, min, max time.Time) (a []meta.ShardGroupInfo, err error)
-}
-
-func (t *testQEMetastore) ShardGroupsByTimeRange(database, policy string, min, max time.Time) (a []meta.ShardGroupInfo, err error) {
-	return t.sgFunc(database, policy, min, max)
-}
-
-func (t *testQEMetastore) Database(name string) (*meta.DatabaseInfo, error) { return nil, nil }
-func (t *testQEMetastore) Databases() ([]meta.DatabaseInfo, error)          { return nil, nil }
-func (t *testQEMetastore) User(name string) (*meta.UserInfo, error)         { return nil, nil }
-func (t *testQEMetastore) AdminUserExists() (bool, error)                   { return false, nil }
-func (t *testQEMetastore) Authenticate(username, password string) (*meta.UserInfo, error) {
-	return nil, nil
-}
-func (t *testQEMetastore) RetentionPolicy(database, name string) (rpi *meta.RetentionPolicyInfo, err error) {
-	return nil, nil
-}
-func (t *testQEMetastore) UserCount() (int, error) { return 0, nil }
-
-func (t *testQEMetastore) NodeID() uint64 { return nID }
-
-func testStoreAndQueryExecutor() (*Store, *QueryExecutor) {
-	path, _ := ioutil.TempDir("", "")
-
-	store := NewStore(path)
-	err := store.Open()
-	if err != nil {
-		panic(err)
-	}
-	database := "foo"
-	retentionPolicy := "bar"
-	store.CreateShard(database, retentionPolicy, sID0)
-	store.CreateShard(database, retentionPolicy, sID1)
-
-	query_executor := NewQueryExecutor(store)
-	query_executor.ShardMapper = &testQEShardMapper{store}
-
-	return store, query_executor
-}
-
-type testQEShardMapper struct {
-	store *Store
-}
-
-func (t *testQEShardMapper) CreateMapper(shard meta.ShardInfo, stmt string, chunkSize int) (Mapper, error) {
-	return t.store.CreateMapper(shard.ID, stmt, chunkSize)
-}
-
-func executeAndGetResults(executor Executor) string {
-	ch := executor.Execute()
-
-	var rows []*influxql.Row
-	for r := range ch {
-		rows = append(rows, r)
-	}
-	return string(mustMarshalJSON(rows))
-}
+import _ "github.com/influxdb/influxdb/tsdb/engine"
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/executor.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/executor.go
@ -0,0 +1,981 @@
+package tsdb
+
+import (
+	"fmt"
+	"math"
+	"sort"
+	"time"
+
+	"github.com/influxdb/influxdb/influxql"
+)
+
+const (
+	// MaxGroupByPoints is the limit on points in a group by statement: return an error if the user is trying to select more than this number.
+	// Most likely they specified a group by interval without time boundaries.
+	MaxGroupByPoints = 100000
+
+	// SelectColumnCountWithOneValue is the column count when selecting only a single value; it is 2 because time is always selected as well.
+	SelectColumnCountWithOneValue = 2
+
+	// IgnoredChunkSize is the chunk size that means "no chunking"; it is what gets passed for aggregate queries as they don't chunk points out
+	IgnoredChunkSize = 0
+)
+
+// Mapper is the interface all Mapper types must implement.
+type Mapper interface {
+	Open() error                     // Called by the executor before any chunk is requested.
+	TagSets() []string               // Used by aggregate execution to collect the available tagsets.
+	Fields() []string                // Used by raw execution to expand a wildcard SELECT.
+	NextChunk() (interface{}, error) // The executor expects a *MapperOutput, or nil when no data remains.
+	Close()                          // Called for every mapper when execution completes.
+}
+
+// StatefulMapper encapsulates a Mapper and some state that the executor needs to
+// track for that mapper.
+type StatefulMapper struct {
+	Mapper
+	bufferedChunk *MapperOutput // Last read chunk.
+	drained       bool          // Set by the executor once NextChunk yields no further chunk.
+}
+
+// NextChunk fetches the next chunk from the wrapped Mapper and returns it as a
+// *MapperOutput. A nil chunk with a nil error means the Mapper has no more
+// data. A chunk of any other type is reported as an error rather than being
+// silently treated as the end of the data.
+func (sm *StatefulMapper) NextChunk() (*MapperOutput, error) {
+	c, err := sm.Mapper.NextChunk()
+	if err != nil {
+		return nil, err
+	}
+	// Test the raw interface value. The result of a failed type assertion is a
+	// typed nil pointer, which never compares equal to a nil interface.
+	if c == nil {
+		return nil, nil
+	}
+	chunk, ok := c.(*MapperOutput)
+	if !ok {
+		return nil, fmt.Errorf("unexpected chunk type %T from mapper", c)
+	}
+	return chunk, nil
+}
+
+// Executor runs a SELECT statement against a set of Mappers and streams the
+// resulting rows over the channel returned by Execute.
+type Executor struct {
+	stmt           *influxql.SelectStatement
+	mappers        []*StatefulMapper
+	chunkSize      int
+	limitedTagSets map[string]struct{} // Set tagsets for which data has reached the LIMIT.
+}
+
+// NewExecutor returns a new Executor for stmt. Every given Mapper is wrapped
+// in a StatefulMapper that starts with no buffered chunk and is not drained.
+func NewExecutor(stmt *influxql.SelectStatement, mappers []Mapper, chunkSize int) *Executor {
+	e := &Executor{
+		stmt:           stmt,
+		mappers:        []*StatefulMapper{},
+		chunkSize:      chunkSize,
+		limitedTagSets: make(map[string]struct{}),
+	}
+	for i := range mappers {
+		e.mappers = append(e.mappers, &StatefulMapper{Mapper: mappers[i]})
+	}
+	return e
+}
+
+// Execute begins execution of the query and returns a channel to receive rows.
+// The rows are produced by a separate goroutine.
+func (e *Executor) Execute() <-chan *influxql.Row {
+	// Certain operations on the SELECT statement can be performed by the Executor without
+	// assistance from the Mappers. This allows the Executor to prepare aggregation functions
+	// and mathematical functions.
+	e.stmt.RewriteDistinct()
+
+	// Unbuffered output channel; rows are handed over one at a time.
+	out := make(chan *influxql.Row)
+
+	// Aggregate execution is the default; raw queries without DISTINCT and
+	// simple derivatives take the raw path instead.
+	run := e.executeAggregate
+	if (e.stmt.IsRawQuery && !e.stmt.HasDistinct()) || e.stmt.IsSimpleDerivative() {
+		run = e.executeRaw
+	}
+	go run(out)
+
+	return out
+}
+
+// mappersDrained reports whether every one of the executor's Mappers has been
+// drained of data.
+func (e *Executor) mappersDrained() bool {
+	for i := 0; i < len(e.mappers); i++ {
+		if e.mappers[i].drained {
+			continue
+		}
+		// Found a mapper that may still have data.
+		return false
+	}
+	return true
+}
+
+// nextMapperTagSet returns the alphabetically lowest tagset key among the
+// chunks currently buffered by the Mappers, or "" if nothing is buffered.
+func (e *Executor) nextMapperTagSet() string {
+	lowest := ""
+	for _, sm := range e.mappers {
+		if sm.bufferedChunk == nil {
+			continue
+		}
+		if k := sm.bufferedChunk.key(); lowest == "" || k < lowest {
+			lowest = k
+		}
+	}
+	return lowest
+}
+
+// nextMapperLowestTime returns, for the given tagset, the lowest of the
+// timestamps carried by the last value of each Mapper's buffered chunk.
+// math.MaxInt64 is returned when no undrained Mapper has a chunk for the tagset.
+func (e *Executor) nextMapperLowestTime(tagset string) int64 {
+	lowest := int64(math.MaxInt64)
+	for _, sm := range e.mappers {
+		if sm.drained || sm.bufferedChunk == nil || sm.bufferedChunk.key() != tagset {
+			continue
+		}
+		vals := sm.bufferedChunk.Values
+		if last := vals[len(vals)-1].Time; last < lowest {
+			lowest = last
+		}
+	}
+	return lowest
+}
+
+// tagSetIsLimited reports whether the given tagset has been marked as LIMITed.
+func (e *Executor) tagSetIsLimited(tagset string) bool {
+	if _, limited := e.limitedTagSets[tagset]; limited {
+		return true
+	}
+	return false
+}
+
+// limitTagSet marks the given tagset as LIMITed.
+func (e *Executor) limitTagSet(tagset string) {
+	var marker struct{}
+	e.limitedTagSets[tagset] = marker
+}
+
+// executeRaw runs a raw (non-aggregate) query. It opens every Mapper, then
+// repeatedly merges the buffered chunks of the alphabetically lowest tagset in
+// time order and emits them on out through a limitedRowWriter, which applies
+// OFFSET, LIMIT and chunking. Errors are delivered as a Row with Err set. The
+// out channel is closed when the function returns, whether or not an error
+// occurred.
+func (e *Executor) executeRaw(out chan *influxql.Row) {
+	// It's important that all resources are released when execution completes.
+	defer e.close()
+
+	// Close out on every return path, error paths included, so that a receiver
+	// ranging over the channel always terminates. Deferred calls run in reverse
+	// order, so out is still closed before the mappers are released.
+	defer close(out)
+
+	// Open the mappers.
+	for _, m := range e.mappers {
+		if err := m.Open(); err != nil {
+			out <- &influxql.Row{Err: err}
+			return
+		}
+	}
+
+	// Get the distinct fields across all mappers.
+	var selectFields, aliasFields []string
+	if e.stmt.HasWildcard() {
+		sf := newStringSet()
+		for _, m := range e.mappers {
+			sf.add(m.Fields()...)
+		}
+		selectFields = sf.list()
+		aliasFields = selectFields
+	} else {
+		selectFields = e.stmt.Fields.Names()
+		aliasFields = e.stmt.Fields.AliasNames()
+	}
+
+	// Used to read ahead chunks from mappers.
+	var rowWriter *limitedRowWriter
+	var currTagset string
+
+	// Keep looping until all mappers drained.
+	var err error
+	for {
+		// Get the next chunk from each Mapper.
+		for _, m := range e.mappers {
+			if m.drained {
+				continue
+			}
+
+			// Set the next buffered chunk on the mapper, or mark it drained.
+			for {
+				if m.bufferedChunk == nil {
+					m.bufferedChunk, err = m.NextChunk()
+					if err != nil {
+						out <- &influxql.Row{Err: err}
+						return
+					}
+					if m.bufferedChunk == nil {
+						// Mapper can do no more for us.
+						m.drained = true
+						break
+					}
+
+					// If the SELECT query is on more than 1 field, but the chunks values from the Mappers
+					// only contain a single value, create k-v pairs using the field name of the chunk
+					// and the value of the chunk. If there is only 1 SELECT field across all mappers then
+					// there is no need to create k-v pairs, and there is no need to distinguish field data,
+					// as it is all for the *same* field.
+					if len(selectFields) > 1 && len(m.bufferedChunk.Fields) == 1 {
+						fieldKey := m.bufferedChunk.Fields[0]
+
+						for i := range m.bufferedChunk.Values {
+							field := map[string]interface{}{fieldKey: m.bufferedChunk.Values[i].Value}
+							m.bufferedChunk.Values[i].Value = field
+						}
+					}
+				}
+
+				// Limits are recorded under the chunk key (see limitTagSet below), so the
+				// lookup must use the same key rather than the bare measurement name.
+				if e.tagSetIsLimited(m.bufferedChunk.key()) {
+					// chunk's tagset is limited, so no good. Try again.
+					m.bufferedChunk = nil
+					continue
+				}
+				// This mapper has a chunk available, and it is not limited.
+				break
+			}
+		}
+
+		// All Mappers done?
+		if e.mappersDrained() {
+			rowWriter.Flush()
+			break
+		}
+
+		// Send out data for the next alphabetically-lowest tagset. All Mappers emit data in this order,
+		// so by always continuing with the lowest tagset until it is finished, we process all data in
+		// the required order, and don't "miss" any.
+		tagset := e.nextMapperTagSet()
+		if tagset != currTagset {
+			currTagset = tagset
+			// Tagset has changed, time for a new rowWriter. Be sure to kick out any residual values.
+			rowWriter.Flush()
+			rowWriter = nil
+		}
+
+		// Process the mapper outputs. We can send out everything up to the min of the last time
+		// of the chunks for the next tagset.
+		minTime := e.nextMapperLowestTime(tagset)
+
+		// Now empty out all the chunks up to the min time. Create new output struct for this data.
+		var chunkedOutput *MapperOutput
+		for _, m := range e.mappers {
+			if m.drained {
+				continue
+			}
+
+			// This mapper's next chunk is not for the next tagset, or the very first value of
+			// the chunk is at a higher acceptable timestamp. Skip it.
+			if m.bufferedChunk.key() != tagset || m.bufferedChunk.Values[0].Time > minTime {
+				continue
+			}
+
+			// Find the index of the point up to the min.
+			ind := len(m.bufferedChunk.Values)
+			for i, mo := range m.bufferedChunk.Values {
+				if mo.Time > minTime {
+					ind = i
+					break
+				}
+			}
+
+			// Add up to the index to the values
+			if chunkedOutput == nil {
+				chunkedOutput = &MapperOutput{
+					Name:      m.bufferedChunk.Name,
+					Tags:      m.bufferedChunk.Tags,
+					cursorKey: m.bufferedChunk.key(),
+				}
+				chunkedOutput.Values = m.bufferedChunk.Values[:ind]
+			} else {
+				chunkedOutput.Values = append(chunkedOutput.Values, m.bufferedChunk.Values[:ind]...)
+			}
+
+			// Clear out the values being sent out, keep the remainder.
+			m.bufferedChunk.Values = m.bufferedChunk.Values[ind:]
+
+			// If we emptied out all the values, clear the mapper's buffered chunk.
+			if len(m.bufferedChunk.Values) == 0 {
+				m.bufferedChunk = nil
+			}
+		}
+
+		// Sort the values by time first so we can then handle offset and limit
+		sort.Sort(MapperValues(chunkedOutput.Values))
+
+		// Now that we have full name and tag details, initialize the rowWriter.
+		// The Name and Tags will be the same for all mappers.
+		if rowWriter == nil {
+			rowWriter = &limitedRowWriter{
+				limit:       e.stmt.Limit,
+				offset:      e.stmt.Offset,
+				chunkSize:   e.chunkSize,
+				name:        chunkedOutput.Name,
+				tags:        chunkedOutput.Tags,
+				selectNames: selectFields,
+				aliasNames:  aliasFields,
+				fields:      e.stmt.Fields,
+				c:           out,
+			}
+		}
+		if e.stmt.HasDerivative() {
+			interval, err := derivativeInterval(e.stmt)
+			if err != nil {
+				out <- &influxql.Row{Err: err}
+				return
+			}
+			// NOTE(review): a fresh processor is installed on every pass of the loop, so
+			// any state it keeps (LastValueFromPreviousChunk) does not survive between
+			// passes - confirm this is intended.
+			rowWriter.transformer = &RawQueryDerivativeProcessor{
+				IsNonNegative:      e.stmt.FunctionCalls()[0].Name == "non_negative_derivative",
+				DerivativeInterval: interval,
+			}
+		}
+
+		// Emit the data via the limiter.
+		if limited := rowWriter.Add(chunkedOutput.Values); limited {
+			// Limit for this tagset was reached, mark it and start draining a new tagset.
+			e.limitTagSet(chunkedOutput.key())
+			continue
+		}
+	}
+}
+
+// executeAggregate runs an aggregate query. For each tagset, taken in
+// alphabetical order, it gathers the chunks offered by all Mappers, buckets the
+// mapped values by interval start time, reduces every bucket with the
+// statement's reduce functions, applies math, fill and derivative
+// post-processing, and emits one row on out. Errors are delivered as a Row with
+// Err set. The out channel is closed when the function returns, whether or not
+// an error occurred.
+func (e *Executor) executeAggregate(out chan *influxql.Row) {
+	// It's important to close all resources when execution completes.
+	defer e.close()
+
+	// Close out on every return path, error paths included, so that a receiver
+	// ranging over the channel always terminates. Deferred calls run in reverse
+	// order, so out is still closed before the mappers are released.
+	defer close(out)
+
+	// Create the functions which will reduce values from mappers for
+	// a given interval. The function offsets within this slice match
+	// the offsets within the value slices that are returned by the
+	// mapper.
+	aggregates := e.stmt.FunctionCalls()
+	reduceFuncs := make([]influxql.ReduceFunc, len(aggregates))
+	for i, c := range aggregates {
+		reduceFunc, err := influxql.InitializeReduceFunc(c)
+		if err != nil {
+			out <- &influxql.Row{Err: err}
+			return
+		}
+		reduceFuncs[i] = reduceFunc
+	}
+
+	// Put together the rows to return, starting with columns.
+	columnNames := make([]string, len(e.stmt.Fields)+1)
+	columnNames[0] = "time"
+	for i, f := range e.stmt.Fields {
+		columnNames[i+1] = f.Name()
+	}
+
+	// Open the mappers.
+	for _, m := range e.mappers {
+		if err := m.Open(); err != nil {
+			out <- &influxql.Row{Err: err}
+			return
+		}
+	}
+
+	// Build the set of available tagsets across all mappers. This is used for
+	// later checks.
+	availTagSets := newStringSet()
+	for _, m := range e.mappers {
+		for _, t := range m.TagSets() {
+			availTagSets.add(t)
+		}
+	}
+
+	// Prime each mapper's chunk buffer.
+	var err error
+	for _, m := range e.mappers {
+		m.bufferedChunk, err = m.NextChunk()
+		if err != nil {
+			out <- &influxql.Row{Err: err}
+			return
+		}
+		if m.bufferedChunk == nil {
+			m.drained = true
+		}
+	}
+
+	// Keep looping until all mappers drained.
+	for !e.mappersDrained() {
+		// Send out data for the next alphabetically-lowest tagset. All Mappers send out in this order
+		// so collect data for this tagset, ignoring all others.
+		tagset := e.nextMapperTagSet()
+		chunks := []*MapperOutput{}
+
+		// Pull as much as possible from each mapper. Stop when a mapper offers
+		// data for a new tagset, or empties completely.
+		for _, m := range e.mappers {
+			if m.drained {
+				continue
+			}
+
+			for {
+				if m.bufferedChunk == nil {
+					m.bufferedChunk, err = m.NextChunk()
+					if err != nil {
+						out <- &influxql.Row{Err: err}
+						return
+					}
+					if m.bufferedChunk == nil {
+						m.drained = true
+						break
+					}
+				}
+
+				// Got a chunk. Can we use it?
+				if m.bufferedChunk.key() != tagset {
+					// No, so just leave it in the buffer.
+					break
+				}
+				// We can, take it.
+				chunks = append(chunks, m.bufferedChunk)
+				m.bufferedChunk = nil
+			}
+		}
+
+		// Prep a row, ready for kicking out.
+		var row *influxql.Row
+
+		// Prep for bucketing data by start time of the interval.
+		buckets := map[int64][][]interface{}{}
+
+		for _, chunk := range chunks {
+			if row == nil {
+				row = &influxql.Row{
+					Name:    chunk.Name,
+					Tags:    chunk.Tags,
+					Columns: columnNames,
+				}
+			}
+
+			startTime := chunk.Values[0].Time
+			_, ok := buckets[startTime]
+			values := chunk.Values[0].Value.([]interface{})
+			if !ok {
+				buckets[startTime] = make([][]interface{}, len(values))
+			}
+			for i, v := range values {
+				buckets[startTime][i] = append(buckets[startTime][i], v)
+			}
+		}
+
+		// Now, after the loop above, within each time bucket is a slice. Within the element of each
+		// slice is another slice of interface{}, ready for passing to the reducer functions.
+
+		// Work each bucket of time, in time ascending order.
+		tMins := make(int64arr, 0, len(buckets))
+		for k := range buckets {
+			tMins = append(tMins, k)
+		}
+		sort.Sort(tMins)
+
+		values := make([][]interface{}, len(tMins))
+		for i, t := range tMins {
+			values[i] = make([]interface{}, 0, len(columnNames))
+			values[i] = append(values[i], time.Unix(0, t).UTC()) // Time value is always first.
+
+			for j, f := range reduceFuncs {
+				reducedVal := f(buckets[t][j])
+				values[i] = append(values[i], reducedVal)
+			}
+		}
+
+		// Perform any mathematics.
+		values = processForMath(e.stmt.Fields, values)
+
+		// Handle any fill options
+		values = e.processFill(values)
+
+		// process derivatives
+		values = e.processDerivative(values)
+
+		// If we have multiple tag sets we'll want to filter out the empty ones
+		if len(availTagSets) > 1 && resultsEmpty(values) {
+			continue
+		}
+
+		row.Values = values
+		out <- row
+	}
+}
+
+// processFill applies the statement's fill option to results. Null fill
+// returns results unchanged, no-fill returns a new slice without the rows that
+// contain a nil value, and the remaining options overwrite nil values in place.
+func (e *Executor) processFill(results [][]interface{}) [][]interface{} {
+	switch e.stmt.Fill {
+	case influxql.NullFill:
+		// Nulls are supposed to stay, nothing to do.
+		return results
+
+	case influxql.NoFill:
+		// Purge every row holding at least one nil. A row can carry several
+		// aggregates; a single nil among them is enough to drop it.
+		kept := make([][]interface{}, 0, len(results))
+	rows:
+		for _, row := range results {
+			// Index 0 is always the time, so start checking at 1.
+			for j := 1; j < len(row); j++ {
+				if row[j] == nil {
+					continue rows
+				}
+			}
+			kept = append(kept, row)
+		}
+		return kept
+	}
+
+	// Fill with either the previous row's value or a specific number.
+	for i := range results {
+		row := results[i]
+		// Index 0 is always the time, so start filling at 1.
+		for j := 1; j < len(row); j++ {
+			if row[j] != nil {
+				continue
+			}
+			if e.stmt.Fill == influxql.PreviousFill {
+				// The first row has no predecessor to copy from.
+				if i > 0 {
+					row[j] = results[i-1][j]
+				}
+			} else if e.stmt.Fill == influxql.NumberFill {
+				row[j] = e.stmt.FillValue
+			}
+		}
+	}
+	return results
+}
+
+// processDerivative returns the derivatives of the results when the statement
+// asks for one; otherwise the results are returned untouched.
+func (e *Executor) processDerivative(results [][]interface{}) [][]interface{} {
+	// Return early if we're not supposed to process the derivatives
+	if !e.stmt.HasDerivative() {
+		return results
+	}
+
+	interval, err := derivativeInterval(e.stmt)
+	if err != nil {
+		return results // XXX need to handle this better.
+	}
+
+	// Negative differences are dropped for non_negative_derivative.
+	nonNegative := e.stmt.FunctionCalls()[0].Name == "non_negative_derivative"
+	return ProcessAggregateDerivative(results, nonNegative, interval)
+}
+
+// close releases the executor's resources by closing every Mapper. Once
+// closed, an executor may not be re-used. Calling close on a nil executor is a
+// no-op.
+func (e *Executor) close() {
+	if e == nil {
+		return
+	}
+	for i := range e.mappers {
+		e.mappers[i].Close()
+	}
+}
+
+// limitedRowWriter accepts raw mapper values, and will emit those values as rows in chunks
+// of the given size. If the chunk size is 0, no chunking will be performed. In addition, if
+// limit is reached, outstanding values will be emitted. If limit is zero, no limit is enforced.
+type limitedRowWriter struct {
+	chunkSize   int
+	limit       int
+	offset      int
+	name        string
+	tags        map[string]string
+	fields      influxql.Fields
+	selectNames []string
+	aliasNames  []string
+	c           chan *influxql.Row
+
+	currValues  []*MapperValue
+	totalOffSet int
+	totalSent   int
+
+	// transformer, when set, is applied to each batch of values before the batch is turned into a row.
+	transformer interface {
+		Process(input []*MapperValue) []*MapperValue
+	}
+}
+
+// Add buffers the given values and emits rows as chunking demands. The result
+// is true when the limit was reached by this call; in that case only values up
+// to the limit are emitted and no further values should be added.
+func (r *limitedRowWriter) Add(values []*MapperValue) (limited bool) {
+	if r.currValues == nil {
+		r.currValues = make([]*MapperValue, 0, r.chunkSize)
+	}
+
+	// Enforce offset: skip leading values until the offset is consumed.
+	if remaining := r.offset - r.totalOffSet; remaining > 0 {
+		if remaining >= len(values) {
+			// The whole batch falls inside the offset.
+			r.totalOffSet += len(values)
+			return false
+		}
+		values = values[remaining:]
+		r.totalOffSet += remaining
+	}
+	r.currValues = append(r.currValues, values...)
+
+	// Check limit. If the pending values satisfy it, cut off the surplus.
+	limited = r.limit > 0 && r.totalSent+len(r.currValues) >= r.limit
+	if limited {
+		r.currValues = r.currValues[:r.limit-r.totalSent]
+	}
+
+	// With chunking in effect, send out as many full chunks as are pending.
+	chunking := r.chunkSize != IgnoredChunkSize
+	if chunking {
+		for len(r.currValues) >= r.chunkSize {
+			r.c <- r.processValues(r.currValues[:r.chunkSize])
+			r.currValues = r.currValues[r.chunkSize:]
+		}
+	}
+
+	// Once the limit is reached nothing more will arrive, so kick out what is
+	// pending: always when not chunking, and any partial chunk when chunking.
+	if limited && (!chunking || len(r.currValues) > 0) {
+		r.c <- r.processValues(r.currValues)
+		r.currValues = nil
+	}
+
+	return limited
+}
+
+// Flush emits any pending values as a single row, adhering to any limits.
+// Chunking is not enforced. Flushing a nil writer does nothing.
+func (r *limitedRowWriter) Flush() {
+	if r == nil {
+		return
+	}
+
+	// With nothing pending, a row is only emitted if none was ever sent. This
+	// guarantees that at least 1 row goes out, even if it is an empty one.
+	pending := len(r.currValues)
+	if pending == 0 && r.totalSent != 0 {
+		return
+	}
+
+	if r.limit > 0 && pending > r.limit {
+		r.currValues = r.currValues[:r.limit]
+	}
+	r.c <- r.processValues(r.currValues)
+	r.currValues = nil
+}
+
+// processValues converts the given values into a single row and returns it.
+// The values first pass through the transformer, if one is set. The row's
+// columns always start with "time", and select names that are tags of the
+// writer are left out of the columns. totalSent grows by the number of values
+// left after transformation. With no values, a row without Values is returned.
+func (r *limitedRowWriter) processValues(values []*MapperValue) *influxql.Row {
+	defer func() {
+		r.totalSent += len(values)
+	}()
+
+	// Work on private copies. The two slices may share a backing array and are
+	// also referenced by the writer, so they must not be reordered in place.
+	selectNames := append([]string(nil), r.selectNames...)
+	aliasNames := append([]string(nil), r.aliasNames...)
+
+	if r.transformer != nil {
+		values = r.transformer.Process(values)
+	}
+
+	// ensure that time is in the select names and in the first position
+	hasTime := false
+	for i, n := range selectNames {
+		if n == "time" {
+			// Swap time to the first position. The alias names are indexed in
+			// parallel with the select names below, so they are swapped too.
+			if i != 0 {
+				selectNames[0], selectNames[i] = selectNames[i], selectNames[0]
+				aliasNames[0], aliasNames[i] = aliasNames[i], aliasNames[0]
+			}
+			hasTime = true
+			break
+		}
+	}
+
+	// time should always be in the list of names they get back
+	if !hasTime {
+		selectNames = append([]string{"time"}, selectNames...)
+		aliasNames = append([]string{"time"}, aliasNames...)
+	}
+
+	// since selectNames can contain tags, we need to strip them out
+	selectFields := make([]string, 0, len(selectNames))
+	aliasFields := make([]string, 0, len(selectNames))
+
+	for i, n := range selectNames {
+		if _, found := r.tags[n]; !found {
+			selectFields = append(selectFields, n)
+			aliasFields = append(aliasFields, aliasNames[i])
+		}
+	}
+
+	row := &influxql.Row{
+		Name:    r.name,
+		Tags:    r.tags,
+		Columns: aliasFields,
+	}
+
+	// Kick out an empty row if no results are available.
+	if len(values) == 0 {
+		return row
+	}
+
+	// if they've selected only a single value we have to handle things a little differently
+	singleValue := len(selectFields) == SelectColumnCountWithOneValue
+
+	// the results will have all of the raw mapper results, convert into the row
+	for _, v := range values {
+		vals := make([]interface{}, len(selectFields))
+
+		if singleValue {
+			vals[0] = time.Unix(0, v.Time).UTC()
+			switch val := v.Value.(type) {
+			case map[string]interface{}:
+				vals[1] = val[selectFields[1]]
+			default:
+				vals[1] = val
+			}
+		} else {
+			fields := v.Value.(map[string]interface{})
+
+			// time is always the first value
+			vals[0] = time.Unix(0, v.Time).UTC()
+
+			// populate the other values, looking at the fields first and
+			// falling back to the value's tags
+			for i := 1; i < len(selectFields); i++ {
+				f, ok := fields[selectFields[i]]
+				if ok {
+					vals[i] = f
+					continue
+				}
+				if v.Tags != nil {
+					f, ok = v.Tags[selectFields[i]]
+					if ok {
+						vals[i] = f
+					}
+				}
+			}
+		}
+
+		row.Values = append(row.Values, vals)
+	}
+
+	// Perform any mathematical post-processing.
+	row.Values = processForMath(r.fields, row.Values)
+
+	return row
+}
+
+// RawQueryDerivativeProcessor turns slices of mapper values into derivative
+// values via Process. It keeps the last value it handled between calls.
+type RawQueryDerivativeProcessor struct {
+	// LastValueFromPreviousChunk is the most recent value consumed by Process;
+	// it is nil until Process has seen a processable input.
+	LastValueFromPreviousChunk *MapperValue
+	IsNonNegative              bool // Whether to drop negative differences
+	// DerivativeInterval is the duration the elapsed time between two values
+	// is normalized to when computing the derivative.
+	DerivativeInterval         time.Duration
+}
+
+// canProcess reports whether a derivative can be computed for input: it needs
+// at least two values, and the first value must be an int64 or a float64.
+func (rqdp *RawQueryDerivativeProcessor) canProcess(input []*MapperValue) bool {
+	// With fewer than two values there is no change to measure. Testing for
+	// "< 2" rather than "== 1" also keeps the input[0] access below from
+	// panicking when the slice is empty.
+	if len(input) < 2 {
+		return false
+	}
+
+	// See if the field value is numeric; if it's not, we can't process the
+	// derivative. Only the first value's type is inspected.
+	switch input[0].Value.(type) {
+	case int64, float64:
+		return true
+	default:
+		return false
+	}
+}
+
+// Process returns the derivatives of successive values in input.
+//
+// Each derivative is the difference between a value and the one before it,
+// divided by the elapsed time expressed in units of DerivativeInterval (0.0
+// when no time elapsed). The processor remembers the last value it handled, so
+// consecutive calls are treated as consecutive chunks of one series.
+//
+// Empty input is returned unchanged. Input that canProcess rejects yields a
+// single value of 0.0 stamped with the first input's time. When IsNonNegative
+// is set, values whose difference is negative are dropped from the result.
+func (rqdp *RawQueryDerivativeProcessor) Process(input []*MapperValue) []*MapperValue {
+	if len(input) == 0 {
+		return input
+	}
+
+	if !rqdp.canProcess(input) {
+		return []*MapperValue{
+			&MapperValue{
+				Time:  input[0].Time,
+				Value: 0.0,
+			},
+		}
+	}
+
+	// On the very first chunk input[0] only seeds the comparison point, so
+	// iteration starts at the second value. On every later chunk the previous
+	// chunk's last value is the comparison point, and input[0] must be
+	// processed too; otherwise it would be silently skipped.
+	start := 0
+	if rqdp.LastValueFromPreviousChunk == nil {
+		rqdp.LastValueFromPreviousChunk = input[0]
+		start = 1
+	}
+
+	derivativeValues := make([]*MapperValue, 0, len(input)-start)
+	for _, v := range input[start:] {
+		prev := rqdp.LastValueFromPreviousChunk
+
+		// Calculate the derivative of successive points by dividing the
+		// difference of each value by the elapsed time normalized to the
+		// interval.
+		diff := int64toFloat64(v.Value) - int64toFloat64(prev.Value)
+		elapsed := v.Time - prev.Time
+
+		value := 0.0
+		if elapsed > 0 {
+			value = diff / (float64(elapsed) / float64(rqdp.DerivativeInterval))
+		}
+
+		// The current value becomes the comparison point even when its
+		// derivative is dropped below.
+		rqdp.LastValueFromPreviousChunk = v
+
+		// Drop negative values for non-negative derivatives.
+		if rqdp.IsNonNegative && diff < 0 {
+			continue
+		}
+
+		derivativeValues = append(derivativeValues, &MapperValue{
+			Time:  v.Time,
+			Value: value,
+		})
+	}
+
+	return derivativeValues
+}
+
+// processForMath applies any math specified in the select statement to the
+// passed in results. When no field is a binary or parenthesized expression the
+// results are returned untouched; otherwise every row is rebuilt as the time
+// column followed by one computed column per field.
+func processForMath(fields influxql.Fields, results [][]interface{}) [][]interface{} {
+	needsMath := false
+	for _, field := range fields {
+		switch field.Expr.(type) {
+		case *influxql.BinaryExpr, *influxql.ParenExpr:
+			needsMath = true
+		}
+	}
+	if !needsMath {
+		return results
+	}
+
+	// One processor per field. GetProcessor hands back the index to pass for
+	// the next field, starting from 1 (index 0 of a row is the time).
+	procs := make([]influxql.Processor, len(fields))
+	nextIndex := 1
+	for i, field := range fields {
+		procs[i], nextIndex = influxql.GetProcessor(field.Expr, nextIndex)
+	}
+
+	computed := make([][]interface{}, 0, len(results))
+	for _, row := range results {
+		out := make([]interface{}, len(fields)+1)
+		// Put the time in, then let every processor derive its column from
+		// the original row.
+		out[0] = row[0]
+		for j, proc := range procs {
+			out[j+1] = proc(row)
+		}
+		computed = append(computed, out)
+	}
+
+	return computed
+}
+
+// ProcessAggregateDerivative returns the derivatives of an aggregate result
+// set. Every input row is expected to hold a time.Time in column 0 and the
+// value in column 1.
+//
+// Empty input is returned as is. A single row, or a first value that is not an
+// int64 or float64, produces one row of 0.0 stamped with the first row's time.
+// Otherwise each output row is the difference between two consecutive rows
+// divided by the elapsed time normalized to interval. Pairs with a nil value
+// are skipped, as are negative differences when isNonNegative is set.
+func ProcessAggregateDerivative(results [][]interface{}, isNonNegative bool, interval time.Duration) [][]interface{} {
+	switch len(results) {
+	case 0:
+		// Nothing to derive from.
+		return results
+	case 1:
+		// A lone value did not change.
+		return [][]interface{}{{results[0][0], 0.0}}
+	}
+
+	// Only the first value's type is checked, because derivatives cannot
+	// currently be combined with other aggregates.
+	switch results[0][1].(type) {
+	case int64, float64:
+		// Numeric, carry on.
+	default:
+		return [][]interface{}{{results[0][0], 0.0}}
+	}
+
+	derivatives := [][]interface{}{}
+	for i, cur := range results[1:] {
+		// results[1:][i] is results[i+1], so results[i] is its predecessor.
+		prev := results[i]
+		if cur[1] == nil || prev[1] == nil {
+			continue
+		}
+
+		elapsed := cur[0].(time.Time).Sub(prev[0].(time.Time))
+		diff := int64toFloat64(cur[1]) - int64toFloat64(prev[1])
+
+		// Drop negative values for non-negative derivatives.
+		if isNonNegative && diff < 0 {
+			continue
+		}
+
+		value := 0.0
+		if elapsed > 0 {
+			value = float64(diff) / (float64(elapsed) / float64(interval))
+		}
+
+		derivatives = append(derivatives, []interface{}{cur[0], value})
+	}
+
+	return derivatives
+}
+
+// derivativeInterval returns the time interval for the one (and only)
+// derivative func in stmt.
+//
+// When the first function call has two arguments, the second one must be a
+// duration literal and its value is the interval. Otherwise the statement's
+// GROUP BY interval is used when it is positive, and one second is the
+// fallback. An error is returned when the statement has no function call, when
+// the second argument is not a duration literal, or when the GROUP BY interval
+// cannot be determined.
+func derivativeInterval(stmt *influxql.SelectStatement) (time.Duration, error) {
+	calls := stmt.FunctionCalls()
+	if len(calls) == 0 {
+		// Report the problem through the error return instead of panicking
+		// on calls[0].
+		return 0, fmt.Errorf("derivative interval requested for a statement without function calls")
+	}
+
+	if args := calls[0].Args; len(args) == 2 {
+		lit, ok := args[1].(*influxql.DurationLiteral)
+		if !ok {
+			return 0, fmt.Errorf("derivative interval must be a duration literal, got %v", args[1])
+		}
+		return lit.Val, nil
+	}
+
+	interval, err := stmt.GroupByInterval()
+	if err != nil {
+		return 0, err
+	}
+	if interval > 0 {
+		return interval, nil
+	}
+	return time.Second, nil
+}
+
+// resultsEmpty reports whether every result value is empty or null. The first
+// column of each row holds the time and is ignored.
+func resultsEmpty(resultValues [][]interface{}) bool {
+	for _, row := range resultValues {
+		for col, val := range row {
+			// Column 0 is the time value, so only later columns count.
+			if col > 0 && val != nil {
+				return false
+			}
+		}
+	}
+	return true
+}
+
+// int64toFloat64 returns v as a float64. v must hold an int64 or a float64;
+// any other dynamic type causes a panic.
+func int64toFloat64(v interface{}) float64 {
+	if f, ok := v.(float64); ok {
+		return f
+	}
+	if n, ok := v.(int64); ok {
+		return float64(n)
+	}
+	panic(fmt.Sprintf("expected either int64 or float64, got %v", v))
+}
+
+// int64arr is a slice of int64 with Len, Swap and Less methods that order it
+// ascending.
+type int64arr []int64
+
+// Len returns the number of elements.
+func (arr int64arr) Len() int {
+	return len(arr)
+}
+
+// Swap exchanges the elements at positions i and j.
+func (arr int64arr) Swap(i, j int) {
+	arr[j], arr[i] = arr[i], arr[j]
+}
+
+// Less reports whether the element at i is smaller than the element at j.
+func (arr int64arr) Less(i, j int) bool {
+	return arr[j] > arr[i]
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/executor_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/executor_test.go
@ -0,0 +1,991 @@
+package tsdb_test
+
+import (
+	"encoding/json"
+	"io/ioutil"
+	"math"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/influxdb/influxdb/influxql"
+	"github.com/influxdb/influxdb/meta"
+	"github.com/influxdb/influxdb/tsdb"
+)
+
+// Fixed identifiers shared by the tests in this file.
+var (
+	// Shard IDs.
+	sID0 = uint64(1)
+	sID1 = uint64(2)
+
+	// Shard group IDs.
+	sgID1 = uint64(3)
+	sgID2 = uint64(4)
+
+	// ID reported by the mock metastore's NodeID and used as shard owner.
+	nID = uint64(42)
+)
+
+// TestWritePointsAndExecuteTwoShards is a simple test to ensure data can be
+// read from two shards. It writes one point to each shard and compares the
+// JSON rendering of several raw and aggregate queries with the expected text.
+func TestWritePointsAndExecuteTwoShards(t *testing.T) {
+	// Create the store and query executor, and install a mock metastore that
+	// reports one shard group per shard.
+	store, queryExecutor := testStoreAndQueryExecutor()
+	defer os.RemoveAll(store.Path())
+	queryExecutor.MetaStore = &testQEMetastore{
+		sgFunc: func(database, policy string, min, max time.Time) (a []meta.ShardGroupInfo, err error) {
+			return []meta.ShardGroupInfo{
+				{
+					ID:        sgID,
+					StartTime: time.Now().Add(-time.Hour),
+					EndTime:   time.Now().Add(time.Hour),
+					Shards: []meta.ShardInfo{
+						{
+							ID:       sID0,
+							OwnerIDs: []uint64{nID},
+						},
+					},
+				},
+				{
+					ID:        sgID,
+					StartTime: time.Now().Add(-2 * time.Hour),
+					EndTime:   time.Now().Add(-time.Hour),
+					Shards: []meta.ShardInfo{
+						{
+							ID:       sID1,
+							OwnerIDs: []uint64{nID},
+						},
+					},
+				},
+			}, nil
+		},
+	}
+
+	// writePoint stores one "cpu" point in the given shard and aborts the test
+	// on failure. The error is passed to t.Fatal directly so that a '%' in its
+	// text is never interpreted as a format verb.
+	writePoint := func(shardID uint64, host string, value int, ts time.Time) {
+		pt := tsdb.NewPoint(
+			"cpu",
+			map[string]string{"host": host, "region": "us-east"},
+			map[string]interface{}{"value": value},
+			ts,
+		)
+		if err := store.WriteToShard(shardID, []tsdb.Point{pt}); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Write two points across shards.
+	writePoint(sID0, "serverA", 100, time.Unix(1, 0).UTC())
+	writePoint(sID1, "serverB", 200, time.Unix(2, 0).UTC())
+
+	var tests = []struct {
+		skip      bool   // Skip test
+		stmt      string // Query statement
+		chunkSize int    // Chunk size for driving the executor
+		expected  string // Expected results, rendered as a string
+	}{
+		{
+			stmt:     `SELECT value FROM cpu`,
+			expected: `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100],["1970-01-01T00:00:02Z",200]]}]`,
+		},
+		{
+			stmt:      `SELECT value FROM cpu`,
+			chunkSize: 1,
+			expected:  `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]},{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:02Z",200]]}]`,
+		},
+		{
+			stmt:     `SELECT value FROM cpu LIMIT 1`,
+			expected: `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]}]`,
+		},
+		{
+			stmt:      `SELECT value FROM cpu LIMIT 1`,
+			chunkSize: 2,
+			expected:  `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]}]`,
+		},
+		{
+			stmt:     `SELECT value FROM cpu WHERE host='serverA'`,
+			expected: `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]}]`,
+		},
+		{
+			stmt:     `SELECT value FROM cpu WHERE host='serverB'`,
+			expected: `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:02Z",200]]}]`,
+		},
+		{
+			stmt:     `SELECT value FROM cpu WHERE host='serverC'`,
+			expected: `null`,
+		},
+		{
+			stmt:     `SELECT value FROM cpu GROUP BY host`,
+			expected: `[{"name":"cpu","tags":{"host":"serverA"},"columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]},{"name":"cpu","tags":{"host":"serverB"},"columns":["time","value"],"values":[["1970-01-01T00:00:02Z",200]]}]`,
+		},
+		{
+			stmt:     `SELECT value FROM cpu GROUP BY region`,
+			expected: `[{"name":"cpu","tags":{"region":"us-east"},"columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100],["1970-01-01T00:00:02Z",200]]}]`,
+		},
+		{
+			stmt:     `SELECT value FROM cpu GROUP BY host,region`,
+			expected: `[{"name":"cpu","tags":{"host":"serverA","region":"us-east"},"columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]},{"name":"cpu","tags":{"host":"serverB","region":"us-east"},"columns":["time","value"],"values":[["1970-01-01T00:00:02Z",200]]}]`,
+		},
+		{
+			stmt:     `SELECT value FROM cpu WHERE host='serverA' GROUP BY host`,
+			expected: `[{"name":"cpu","tags":{"host":"serverA"},"columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]}]`,
+		},
+
+		// Aggregate queries.
+		{
+			stmt:     `SELECT sum(value) FROM cpu`,
+			expected: `[{"name":"cpu","columns":["time","sum"],"values":[["1970-01-01T00:00:00Z",300]]}]`,
+		},
+	}
+
+	for _, tt := range tests {
+		if tt.skip {
+			t.Logf("Skipping test %s", tt.stmt)
+			continue
+		}
+		executor, err := queryExecutor.Plan(mustParseSelectStatement(tt.stmt), tt.chunkSize)
+		if err != nil {
+			t.Fatalf("failed to plan query: %s", err.Error())
+		}
+		got := executeAndGetResults(executor)
+		if got != tt.expected {
+			t.Fatalf("Test %s\nexp: %s\ngot: %s\n", tt.stmt, tt.expected, got)
+		}
+	}
+}
+
+// TestWritePointsAndExecuteTwoShardsAlign tests that the executor correctly
+// orders data across shards when the points written to them interleave by
+// time.
+func TestWritePointsAndExecuteTwoShardsAlign(t *testing.T) {
+	// Create the store and query executor, and install a mock metastore that
+	// reports one shard group per shard.
+	store, queryExecutor := testStoreAndQueryExecutor()
+	defer os.RemoveAll(store.Path())
+	queryExecutor.MetaStore = &testQEMetastore{
+		sgFunc: func(database, policy string, min, max time.Time) (a []meta.ShardGroupInfo, err error) {
+			return []meta.ShardGroupInfo{
+				{
+					ID:        sgID,
+					StartTime: time.Now().Add(-2 * time.Hour),
+					EndTime:   time.Now().Add(-time.Hour),
+					Shards: []meta.ShardInfo{
+						{
+							ID:       sID1,
+							OwnerIDs: []uint64{nID},
+						},
+					},
+				},
+				{
+					ID:        sgID,
+					StartTime: time.Now().Add(-2 * time.Hour),
+					EndTime:   time.Now().Add(time.Hour),
+					Shards: []meta.ShardInfo{
+						{
+							ID:       sID0,
+							OwnerIDs: []uint64{nID},
+						},
+					},
+				},
+			}, nil
+		},
+	}
+
+	// writePoint stores one "cpu" point, stamped at the given Unix second, in
+	// the given shard and aborts the test on failure. The error is passed to
+	// t.Fatal directly so that a '%' in its text is never interpreted as a
+	// format verb.
+	writePoint := func(shardID uint64, host string, value int, sec int64) {
+		pt := tsdb.NewPoint(
+			"cpu",
+			map[string]string{"host": host},
+			map[string]interface{}{"value": value},
+			time.Unix(sec, 0).UTC(),
+		)
+		if err := store.WriteToShard(shardID, []tsdb.Point{pt}); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Write interleaving, by time, chunks to the shards.
+	writePoint(sID0, "serverA", 100, 1)
+	writePoint(sID1, "serverB", 200, 2)
+	writePoint(sID1, "serverA", 300, 3)
+
+	var tests = []struct {
+		skip      bool   // Skip test
+		stmt      string // Query statement
+		chunkSize int    // Chunk size for driving the executor
+		expected  string // Expected results, rendered as a string
+	}{
+		{
+			stmt:      `SELECT value FROM cpu`,
+			chunkSize: 1,
+			expected:  `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100]]},{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:02Z",200]]},{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:03Z",300]]}]`,
+		},
+		{
+			stmt:      `SELECT value FROM cpu`,
+			chunkSize: 2,
+			expected:  `[{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100],["1970-01-01T00:00:02Z",200]]},{"name":"cpu","columns":["time","value"],"values":[["1970-01-01T00:00:03Z",300]]}]`,
+		},
+		{
+			stmt:      `SELECT mean(value),sum(value) FROM cpu`,
+			chunkSize: 2,
+			expected:  `[{"name":"cpu","columns":["time","mean","sum"],"values":[["1970-01-01T00:00:00Z",200,600]]}]`,
+		},
+	}
+
+	for _, tt := range tests {
+		if tt.skip {
+			t.Logf("Skipping test %s", tt.stmt)
+			continue
+		}
+		executor, err := queryExecutor.Plan(mustParseSelectStatement(tt.stmt), tt.chunkSize)
+		if err != nil {
+			t.Fatalf("failed to plan query: %s", err.Error())
+		}
+		got := executeAndGetResults(executor)
+		if got != tt.expected {
+			t.Fatalf("Test %s\nexp: %s\ngot: %s\n", tt.stmt, tt.expected, got)
+		}
+	}
+}
+
+// TestWritePointsAndExecuteTwoShardsQueryRewrite tests that the engine handles
+// query re-writing across stores: each store holds a point with a different
+// field, and wildcard queries must return the union of the fields.
+func TestWritePointsAndExecuteTwoShardsQueryRewrite(t *testing.T) {
+	// Create two distinct stores, so that the shard mappers share nothing.
+	store0 := testStore()
+	defer os.RemoveAll(store0.Path())
+	store1 := testStore()
+	defer os.RemoveAll(store1.Path())
+
+	// Create a shard in each store.
+	database := "foo"
+	retentionPolicy := "bar"
+	store0.CreateShard(database, retentionPolicy, sID0)
+	store1.CreateShard(database, retentionPolicy, sID1)
+
+	// writePoint stores one "cpu" point with a single field in the given shard
+	// of the given store and aborts the test on failure. The error is passed
+	// to t.Fatal directly so that a '%' in its text is never interpreted as a
+	// format verb.
+	writePoint := func(store *tsdb.Store, shardID uint64, host, field string, value int, ts time.Time) {
+		pt := tsdb.NewPoint(
+			"cpu",
+			map[string]string{"host": host},
+			map[string]interface{}{field: value},
+			ts,
+		)
+		if err := store.WriteToShard(shardID, []tsdb.Point{pt}); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Write two points across shards.
+	writePoint(store0, sID0, "serverA", "value1", 100, time.Unix(1, 0).UTC())
+	writePoint(store1, sID1, "serverB", "value2", 200, time.Unix(2, 0).UTC())
+
+	var tests = []struct {
+		skip      bool   // Skip test
+		stmt      string // Query statement
+		chunkSize int    // Chunk size for driving the executor
+		expected  string // Expected results, rendered as a string
+	}{
+		{
+			stmt:     `SELECT * FROM cpu`,
+			expected: `[{"name":"cpu","columns":["time","host","value1","value2"],"values":[["1970-01-01T00:00:01Z","serverA",100,null],["1970-01-01T00:00:02Z","serverB",null,200]]}]`,
+		},
+		{
+			stmt:     `SELECT * FROM cpu GROUP BY *`,
+			expected: `[{"name":"cpu","tags":{"host":"serverA"},"columns":["time","value1","value2"],"values":[["1970-01-01T00:00:01Z",100,null]]},{"name":"cpu","tags":{"host":"serverB"},"columns":["time","value1","value2"],"values":[["1970-01-01T00:00:02Z",null,200]]}]`,
+		},
+	}
+	for _, tt := range tests {
+		if tt.skip {
+			t.Logf("Skipping test %s", tt.stmt)
+			continue
+		}
+
+		parsedSelectStmt := mustParseSelectStatement(tt.stmt)
+
+		// Create Mappers and Executor.
+		mapper0, err := store0.CreateMapper(sID0, tt.stmt, tt.chunkSize)
+		if err != nil {
+			t.Fatalf("failed to create mapper0: %s", err.Error())
+		}
+		mapper1, err := store1.CreateMapper(sID1, tt.stmt, tt.chunkSize)
+		if err != nil {
+			t.Fatalf("failed to create mapper1: %s", err.Error())
+		}
+		executor := tsdb.NewExecutor(parsedSelectStmt, []tsdb.Mapper{mapper0, mapper1}, tt.chunkSize)
+
+		// Check the results.
+		got := executeAndGetResults(executor)
+		if got != tt.expected {
+			t.Fatalf("Test %s\nexp: %s\ngot: %s\n", tt.stmt, tt.expected, got)
+		}
+	}
+}
+
+// TestWritePointsAndExecuteTwoShardsTagSetOrdering tests that the executor
+// correctly orders data across shards when the tagsets are not presented in
+// alphabetical order across shards.
+func TestWritePointsAndExecuteTwoShardsTagSetOrdering(t *testing.T) {
+	// Create the store and query executor, and install a mock metastore that
+	// reports one shard group per shard.
+	store, queryExecutor := testStoreAndQueryExecutor()
+	defer os.RemoveAll(store.Path())
+	queryExecutor.MetaStore = &testQEMetastore{
+		sgFunc: func(database, policy string, min, max time.Time) (a []meta.ShardGroupInfo, err error) {
+			return []meta.ShardGroupInfo{
+				{
+					ID: sgID,
+					Shards: []meta.ShardInfo{
+						{
+							ID:       sID0,
+							OwnerIDs: []uint64{nID},
+						},
+					},
+				},
+				{
+					ID: sgID,
+					Shards: []meta.ShardInfo{
+						{
+							ID:       sID1,
+							OwnerIDs: []uint64{nID},
+						},
+					},
+				},
+			}, nil
+		},
+	}
+
+	// writePoint stores one "cpu" point, stamped at the given Unix second, in
+	// the given shard and aborts the test on failure. The error is passed to
+	// t.Fatal directly so that a '%' in its text is never interpreted as a
+	// format verb.
+	writePoint := func(shardID uint64, host string, value int, sec int64) {
+		pt := tsdb.NewPoint(
+			"cpu",
+			map[string]string{"host": host},
+			map[string]interface{}{"value": value},
+			time.Unix(sec, 0).UTC(),
+		)
+		if err := store.WriteToShard(shardID, []tsdb.Point{pt}); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Write tagsets "y" and "z" to first shard.
+	writePoint(sID0, "y", 100, 1)
+	writePoint(sID0, "z", 200, 1)
+
+	// Write tagsets "x", "y" and "z" to second shard.
+	writePoint(sID1, "x", 300, 2)
+	writePoint(sID1, "y", 400, 3)
+	writePoint(sID1, "z", 500, 3)
+
+	var tests = []struct {
+		skip      bool   // Skip test
+		stmt      string // Query statement
+		chunkSize int    // Chunk size for driving the executor
+		expected  string // Expected results, rendered as a string
+	}{
+		{
+			stmt:     `SELECT sum(value) FROM cpu GROUP BY host`,
+			expected: `[{"name":"cpu","tags":{"host":"x"},"columns":["time","sum"],"values":[["1970-01-01T00:00:00Z",300]]},{"name":"cpu","tags":{"host":"y"},"columns":["time","sum"],"values":[["1970-01-01T00:00:00Z",500]]},{"name":"cpu","tags":{"host":"z"},"columns":["time","sum"],"values":[["1970-01-01T00:00:00Z",700]]}]`,
+		},
+		{
+			stmt:     `SELECT value FROM cpu GROUP BY host`,
+			expected: `[{"name":"cpu","tags":{"host":"x"},"columns":["time","value"],"values":[["1970-01-01T00:00:02Z",300]]},{"name":"cpu","tags":{"host":"y"},"columns":["time","value"],"values":[["1970-01-01T00:00:01Z",100],["1970-01-01T00:00:03Z",400]]},{"name":"cpu","tags":{"host":"z"},"columns":["time","value"],"values":[["1970-01-01T00:00:01Z",200],["1970-01-01T00:00:03Z",500]]}]`,
+		},
+	}
+
+	for _, tt := range tests {
+		if tt.skip {
+			t.Logf("Skipping test %s", tt.stmt)
+			continue
+		}
+		executor, err := queryExecutor.Plan(mustParseSelectStatement(tt.stmt), tt.chunkSize)
+		if err != nil {
+			t.Fatalf("failed to plan query: %s", err.Error())
+		}
+		got := executeAndGetResults(executor)
+		if got != tt.expected {
+			t.Fatalf("Test %s\nexp: %s\ngot: %s\n", tt.stmt, tt.expected, got)
+		}
+	}
+}
+
+// TestProcessAggregateDerivative tests the ProcessAggregateDerivative
+// transformation function on the engine. This is called for a query with a
+// GROUP BY.
+func TestProcessAggregateDerivative(t *testing.T) {
+	// All rows are stamped relative to the Unix epoch, one day apart.
+	epoch := time.Unix(0, 0)
+	day := func(n int) time.Time {
+		return epoch.Add(time.Duration(n) * 24 * time.Hour)
+	}
+
+	tests := []struct {
+		name     string
+		fn       string
+		interval time.Duration
+		in       [][]interface{}
+		exp      [][]interface{}
+	}{
+		{
+			name:     "empty input",
+			fn:       "derivative",
+			interval: 24 * time.Hour,
+			in:       [][]interface{}{},
+			exp:      [][]interface{}{},
+		},
+		{
+			name:     "single row returns 0.0",
+			fn:       "derivative",
+			interval: 24 * time.Hour,
+			in:       [][]interface{}{{epoch, 1.0}},
+			exp:      [][]interface{}{{epoch, 0.0}},
+		},
+		{
+			name:     "basic derivative",
+			fn:       "derivative",
+			interval: 24 * time.Hour,
+			in: [][]interface{}{
+				{epoch, 1.0},
+				{day(1), 3.0},
+				{day(2), 5.0},
+				{day(3), 9.0},
+			},
+			exp: [][]interface{}{
+				{day(1), 2.0},
+				{day(2), 2.0},
+				{day(3), 4.0},
+			},
+		},
+		{
+			name:     "12h interval",
+			fn:       "derivative",
+			interval: 12 * time.Hour,
+			in: [][]interface{}{
+				{epoch, 1.0},
+				{day(1), 2.0},
+				{day(2), 3.0},
+				{day(3), 4.0},
+			},
+			exp: [][]interface{}{
+				{day(1), 0.5},
+				{day(2), 0.5},
+				{day(3), 0.5},
+			},
+		},
+		{
+			name:     "negative derivatives",
+			fn:       "derivative",
+			interval: 24 * time.Hour,
+			in: [][]interface{}{
+				{epoch, 1.0},
+				{day(1), 2.0},
+				{day(2), 0.0},
+				{day(3), 4.0},
+			},
+			exp: [][]interface{}{
+				{day(1), 1.0},
+				{day(2), -2.0},
+				{day(3), 4.0},
+			},
+		},
+		{
+			name:     "negative derivatives",
+			fn:       "non_negative_derivative",
+			interval: 24 * time.Hour,
+			in: [][]interface{}{
+				{epoch, 1.0},
+				{day(1), 2.0},
+				// This row results in a negative derivative.
+				{day(2), 0.0},
+				{day(3), 4.0},
+			},
+			exp: [][]interface{}{
+				{day(1), 1.0},
+				{day(3), 4.0},
+			},
+		},
+		{
+			name:     "integer derivatives",
+			fn:       "derivative",
+			interval: 24 * time.Hour,
+			in: [][]interface{}{
+				{epoch, 1.0},
+				{day(1), int64(3)},
+				{day(2), int64(5)},
+				{day(3), int64(9)},
+			},
+			exp: [][]interface{}{
+				{day(1), 2.0},
+				{day(2), 2.0},
+				{day(3), 4.0},
+			},
+		},
+		{
+			name:     "string derivatives",
+			fn:       "derivative",
+			interval: 24 * time.Hour,
+			in: [][]interface{}{
+				{epoch, "1.0"},
+				{day(1), "2.0"},
+				{day(2), "3.0"},
+				{day(3), "4.0"},
+			},
+			exp: [][]interface{}{{epoch, 0.0}},
+		},
+	}
+
+	for _, test := range tests {
+		nonNegative := test.fn == "non_negative_derivative"
+		got := tsdb.ProcessAggregateDerivative(test.in, nonNegative, test.interval)
+
+		if len(got) != len(test.exp) {
+			t.Fatalf("ProcessAggregateDerivative(%s) - %s\nlen mismatch: got %d, exp %d", test.fn, test.name, len(got), len(test.exp))
+		}
+
+		for i, want := range test.exp {
+			if want[0] != got[i][0] || want[1] != got[i][1] {
+				t.Fatalf("ProcessAggregateDerivative - %s results mismatch:\ngot %v\nexp %v", test.name, got, test.exp)
+			}
+		}
+	}
+}
+
+// TestProcessRawQueryDerivative tests the RawQueryDerivativeProcessor
+// transformation function on the engine. This is called for queries that do
+// not have a GROUP BY.
+func TestProcessRawQueryDerivative(t *testing.T) {
+	// day returns the timestamp, in nanoseconds since the Unix epoch, n days
+	// after the epoch; day(0) is 0.
+	day := func(n int) int64 {
+		return time.Unix(0, 0).Add(time.Duration(n) * 24 * time.Hour).UnixNano()
+	}
+	// mv builds a mapper value carrying only a time and a value.
+	mv := func(ts int64, value interface{}) *tsdb.MapperValue {
+		return &tsdb.MapperValue{Time: ts, Value: value}
+	}
+
+	tests := []struct {
+		name     string
+		fn       string
+		interval time.Duration
+		in       []*tsdb.MapperValue
+		exp      []*tsdb.MapperValue
+	}{
+		{
+			name:     "empty input",
+			fn:       "derivative",
+			interval: 24 * time.Hour,
+			in:       []*tsdb.MapperValue{},
+			exp:      []*tsdb.MapperValue{},
+		},
+		{
+			name:     "single row returns 0.0",
+			fn:       "derivative",
+			interval: 24 * time.Hour,
+			in:       []*tsdb.MapperValue{mv(day(0), 1.0)},
+			exp:      []*tsdb.MapperValue{mv(day(0), 0.0)},
+		},
+		{
+			name:     "basic derivative",
+			fn:       "derivative",
+			interval: 24 * time.Hour,
+			in: []*tsdb.MapperValue{
+				mv(day(0), 0.0),
+				mv(day(1), 3.0),
+				mv(day(2), 5.0),
+				mv(day(3), 9.0),
+			},
+			exp: []*tsdb.MapperValue{
+				mv(day(1), 3.0),
+				mv(day(2), 2.0),
+				mv(day(3), 4.0),
+			},
+		},
+		{
+			name:     "integer derivative",
+			fn:       "derivative",
+			interval: 24 * time.Hour,
+			in: []*tsdb.MapperValue{
+				mv(day(0), int64(0)),
+				mv(day(1), int64(3)),
+				mv(day(2), int64(5)),
+				mv(day(3), int64(9)),
+			},
+			exp: []*tsdb.MapperValue{
+				mv(day(1), 3.0),
+				mv(day(2), 2.0),
+				mv(day(3), 4.0),
+			},
+		},
+		{
+			name:     "12h interval",
+			fn:       "derivative",
+			interval: 12 * time.Hour,
+			in: []*tsdb.MapperValue{
+				mv(day(0), 1.0),
+				mv(day(1), 2.0),
+				mv(day(2), 3.0),
+				mv(day(3), 4.0),
+			},
+			exp: []*tsdb.MapperValue{
+				mv(day(1), 0.5),
+				mv(day(2), 0.5),
+				mv(day(3), 0.5),
+			},
+		},
+		{
+			name:     "negative derivatives",
+			fn:       "derivative",
+			interval: 24 * time.Hour,
+			in: []*tsdb.MapperValue{
+				mv(day(0), 1.0),
+				mv(day(1), 2.0),
+				// should go negative
+				mv(day(2), 0.0),
+				mv(day(3), 4.0),
+			},
+			exp: []*tsdb.MapperValue{
+				mv(day(1), 1.0),
+				mv(day(2), -2.0),
+				mv(day(3), 4.0),
+			},
+		},
+		{
+			name:     "negative derivatives",
+			fn:       "non_negative_derivative",
+			interval: 24 * time.Hour,
+			in: []*tsdb.MapperValue{
+				mv(day(0), 1.0),
+				mv(day(1), 2.0),
+				// should go negative
+				mv(day(2), 0.0),
+				mv(day(3), 4.0),
+			},
+			exp: []*tsdb.MapperValue{
+				mv(day(1), 1.0),
+				mv(day(3), 4.0),
+			},
+		},
+		{
+			name:     "string derivatives",
+			fn:       "derivative",
+			interval: 24 * time.Hour,
+			in: []*tsdb.MapperValue{
+				mv(day(0), "1.0"),
+				mv(day(1), "2.0"),
+				mv(day(2), "3.0"),
+				mv(day(3), "4.0"),
+			},
+			exp: []*tsdb.MapperValue{mv(day(0), 0.0)},
+		},
+	}
+
+	for _, test := range tests {
+		// A fresh processor per case, so no state leaks between cases.
+		p := tsdb.RawQueryDerivativeProcessor{
+			IsNonNegative:      test.fn == "non_negative_derivative",
+			DerivativeInterval: test.interval,
+		}
+		got := p.Process(test.in)
+
+		if len(got) != len(test.exp) {
+			t.Fatalf("RawQueryDerivativeProcessor(%s) - %s\nlen mismatch: got %d, exp %d", test.fn, test.name, len(got), len(test.exp))
+		}
+
+		for i, want := range test.exp {
+			// Values are compared with a small tolerance; times must match exactly.
+			delta := math.Abs(want.Value.(float64) - got[i].Value.(float64))
+			if want.Time != got[i].Time || delta > 0.0000001 {
+				t.Fatalf("RawQueryDerivativeProcessor - %s results mismatch:\ngot %v\nexp %v", test.name, got, test.exp)
+			}
+		}
+	}
+}
+
+// testQEMetastore is a mock metastore for the query executor tests. Only
+// ShardGroupsByTimeRange and NodeID return meaningful data; every other method
+// is a stub returning zero values.
+type testQEMetastore struct {
+	// sgFunc supplies the result of ShardGroupsByTimeRange.
+	sgFunc func(database, policy string, min, max time.Time) (a []meta.ShardGroupInfo, err error)
+}
+
+// ShardGroupsByTimeRange forwards its arguments to sgFunc and returns its result.
+func (t *testQEMetastore) ShardGroupsByTimeRange(database, policy string, min, max time.Time) (a []meta.ShardGroupInfo, err error) {
+	return t.sgFunc(database, policy, min, max)
+}
+
+// The methods below are stubs that always return zero values and a nil error.
+func (t *testQEMetastore) Database(name string) (*meta.DatabaseInfo, error) { return nil, nil }
+func (t *testQEMetastore) Databases() ([]meta.DatabaseInfo, error)          { return nil, nil }
+func (t *testQEMetastore) User(name string) (*meta.UserInfo, error)         { return nil, nil }
+func (t *testQEMetastore) AdminUserExists() (bool, error)                   { return false, nil }
+func (t *testQEMetastore) Authenticate(username, password string) (*meta.UserInfo, error) {
+	return nil, nil
+}
+func (t *testQEMetastore) RetentionPolicy(database, name string) (rpi *meta.RetentionPolicyInfo, err error) {
+	return nil, nil
+}
+func (t *testQEMetastore) UserCount() (int, error) { return 0, nil }
+
+// NodeID returns the fixed test node ID nID.
+func (t *testQEMetastore) NodeID() uint64 { return nID }
+
+// testStore creates a store rooted in a fresh temporary directory and opens
+// it. It panics if the directory cannot be created or the store cannot be
+// opened. Removing the directory (store.Path()) is left to the caller.
+func testStore() *tsdb.Store {
+	// Do not discard the error: an empty path would point the store at an
+	// unintended location.
+	path, err := ioutil.TempDir("", "")
+	if err != nil {
+		panic(err)
+	}
+
+	store := tsdb.NewStore(path)
+	if err := store.Open(); err != nil {
+		panic(err)
+	}
+	return store
+}
+
+// testStoreAndQueryExecutor returns a test store holding the shards sID0 and
+// sID1 (database "foo", retention policy "bar"), together with a query
+// executor whose shard mapper reads directly from that store.
+func testStoreAndQueryExecutor() (*tsdb.Store, *tsdb.QueryExecutor) {
+	const (
+		database        = "foo"
+		retentionPolicy = "bar"
+	)
+
+	store := testStore()
+	for _, shardID := range []uint64{sID0, sID1} {
+		store.CreateShard(database, retentionPolicy, shardID)
+	}
+
+	qe := tsdb.NewQueryExecutor(store)
+	qe.ShardMapper = &testQEShardMapper{store}
+
+	return store, qe
+}
+
+// testQEShardMapper is a shard mapper for tests that creates its mappers
+// straight from a single local store.
+type testQEShardMapper struct {
+	store *tsdb.Store
+}
+
+// CreateMapper asks the underlying store for a mapper on the shard with the
+// given info's ID.
+func (m *testQEShardMapper) CreateMapper(shard meta.ShardInfo, stmt string, chunkSize int) (tsdb.Mapper, error) {
+	shardID := shard.ID
+	return m.store.CreateMapper(shardID, stmt, chunkSize)
+}
+
+// executeAndGetResults runs the executor, collects every row it emits until
+// the result channel is closed, and returns the rows rendered as JSON. No rows
+// at all render as "null". It panics if the rows cannot be marshalled.
+func executeAndGetResults(executor *tsdb.Executor) string {
+	// Deliberately left nil so that an empty result marshals to "null".
+	var collected []*influxql.Row
+	for row := range executor.Execute() {
+		collected = append(collected, row)
+	}
+
+	encoded, err := json.Marshal(collected)
+	if err != nil {
+		panic(err)
+	}
+	return string(encoded)
+}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/internal/meta.pb.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/internal/meta.pb.go
@ -1,12 +1,12 @@
-// Code generated by protoc-gen-go.
-// source: meta.proto
+// Code generated by protoc-gen-gogo.
+// source: internal/meta.proto
 // DO NOT EDIT!

 /*
 Package internal is a generated protocol buffer package.

 It is generated from these files:
-	meta.proto
+	internal/meta.proto

 It has these top-level messages:
 	Series
@ -16,7 +16,7 @@ It has these top-level messages:
 */
 package internal

-import proto "github.com/golang/protobuf/proto"
+import proto "github.com/gogo/protobuf/proto"
 import math "math"

 // Reference imports to suppress errors if they are not otherwise used.
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/mapper.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/mapper.go
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/mapper_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/mapper_test.go
@ -1,4 +1,4 @@
-package tsdb
+package tsdb_test

 import (
 	"encoding/json"
@ -12,92 +12,119 @@ import (
 	"time"

 	"github.com/influxdb/influxdb/influxql"
+	"github.com/influxdb/influxdb/tsdb"
 )

-func TestShardMapper_RawMapperTagSets(t *testing.T) {
+func TestShardMapper_RawMapperTagSetsFields(t *testing.T) {
 	tmpDir, _ := ioutil.TempDir("", "shard_test")
 	defer os.RemoveAll(tmpDir)
 	shard := mustCreateShard(tmpDir)

 	pt1time := time.Unix(1, 0).UTC()
-	pt1 := NewPoint(
+	pt1 := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "serverA", "region": "us-east"},
-		map[string]interface{}{"value": 42},
+		map[string]interface{}{"idle": 60},
 		pt1time,
 	)
 	pt2time := time.Unix(2, 0).UTC()
-	pt2 := NewPoint(
+	pt2 := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "serverB", "region": "us-east"},
-		map[string]interface{}{"value": 60},
+		map[string]interface{}{"load": 60},
 		pt2time,
 	)
-	err := shard.WritePoints([]Point{pt1, pt2})
+	err := shard.WritePoints([]tsdb.Point{pt1, pt2})
 	if err != nil {
 		t.Fatalf(err.Error())
 	}

 	var tests = []struct {
-		stmt     string
-		expected []string
+		stmt           string
+		expectedTags   []string
+		expectedFields []string
 	}{
 		{
-			stmt:     `SELECT value FROM cpu`,
-			expected: []string{"cpu"},
+			stmt:           `SELECT load FROM cpu`,
+			expectedTags:   []string{"cpu"},
+			expectedFields: []string{"load"},
 		},
 		{
-			stmt:     `SELECT value FROM cpu GROUP BY host`,
-			expected: []string{"cpu|host|serverA", "cpu|host|serverB"},
+			stmt:           `SELECT derivative(load) FROM cpu`,
+			expectedTags:   []string{"cpu"},
+			expectedFields: []string{"load"},
 		},
 		{
-			stmt:     `SELECT value FROM cpu GROUP BY region`,
-			expected: []string{"cpu|region|us-east"},
+			stmt:           `SELECT idle,load FROM cpu`,
+			expectedTags:   []string{"cpu"},
+			expectedFields: []string{"idle", "load"},
 		},
 		{
-			stmt:     `SELECT value FROM cpu WHERE host='serverA'`,
-			expected: []string{"cpu"},
+			stmt:           `SELECT load,idle FROM cpu`,
+			expectedTags:   []string{"cpu"},
+			expectedFields: []string{"idle", "load"},
 		},
 		{
-			stmt:     `SELECT value FROM cpu WHERE host='serverB'`,
-			expected: []string{"cpu"},
+			stmt:           `SELECT load FROM cpu GROUP BY host`,
+			expectedTags:   []string{"cpu|host|serverA", "cpu|host|serverB"},
+			expectedFields: []string{"load"},
 		},
 		{
-			stmt:     `SELECT value FROM cpu WHERE host='serverC'`,
-			expected: []string{},
+			stmt:           `SELECT load FROM cpu GROUP BY region`,
+			expectedTags:   []string{"cpu|region|us-east"},
+			expectedFields: []string{"load"},
+		},
+		{
+			stmt:           `SELECT load FROM cpu WHERE host='serverA'`,
+			expectedTags:   []string{"cpu"},
+			expectedFields: []string{"load"},
+		},
+		{
+			stmt:           `SELECT load FROM cpu WHERE host='serverB'`,
+			expectedTags:   []string{"cpu"},
+			expectedFields: []string{"load"},
+		},
+		{
+			stmt:           `SELECT load FROM cpu WHERE host='serverC'`,
+			expectedTags:   []string{},
+			expectedFields: []string{"load"},
 		},
 	}

 	for _, tt := range tests {
 		stmt := mustParseSelectStatement(tt.stmt)
 		mapper := openRawMapperOrFail(t, shard, stmt, 0)
-		got := mapper.TagSets()
-		if !reflect.DeepEqual(got, tt.expected) {
-			t.Errorf("test '%s'\n\tgot      %s\n\texpected %s", tt.stmt, got, tt.expected)
+		tags := mapper.TagSets()
+		if !reflect.DeepEqual(tags, tt.expectedTags) {
+			t.Errorf("test '%s'\n\tgot      %s\n\texpected %s", tt.stmt, tags, tt.expectedTags)
+		}
+		fields := mapper.Fields()
+		if !reflect.DeepEqual(fields, tt.expectedFields) {
+			t.Errorf("test '%s'\n\tgot      %s\n\texpected %s", tt.stmt, fields, tt.expectedFields)
 		}
 	}
 }

-func TestShardMapper_WriteAndSingleMapperRawQuery(t *testing.T) {
+func TestShardMapper_WriteAndSingleMapperRawQuerySingleValue(t *testing.T) {
 	tmpDir, _ := ioutil.TempDir("", "shard_test")
 	defer os.RemoveAll(tmpDir)
 	shard := mustCreateShard(tmpDir)

 	pt1time := time.Unix(1, 0).UTC()
-	pt1 := NewPoint(
+	pt1 := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "serverA", "region": "us-east"},
-		map[string]interface{}{"value": 42},
+		map[string]interface{}{"load": 42},
 		pt1time,
 	)
 	pt2time := time.Unix(2, 0).UTC()
-	pt2 := NewPoint(
+	pt2 := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "serverB", "region": "us-east"},
-		map[string]interface{}{"value": 60},
+		map[string]interface{}{"load": 60},
 		pt2time,
 	)
-	err := shard.WritePoints([]Point{pt1, pt2})
+	err := shard.WritePoints([]tsdb.Point{pt1, pt2})
 	if err != nil {
 		t.Fatalf(err.Error())
 	}
@ -108,62 +135,65 @@ func TestShardMapper_WriteAndSingleMapperRawQuery(t *testing.T) {
 		expected  []string
 	}{
 		{
-			stmt:     `SELECT value FROM cpu`,
-			expected: []string{`{"name":"cpu","values":[{"time":1000000000,"value":42},{"time":2000000000,"value":60}]}`, `null`},
+			stmt:     `SELECT load FROM cpu`,
+			expected: []string{`{"name":"cpu","fields":["load"],"values":[{"time":1000000000,"value":42,"tags":{"host":"serverA","region":"us-east"}},{"time":2000000000,"value":60,"tags":{"host":"serverB","region":"us-east"}}]}`, `null`},
 		},
 		{
-			stmt:      `SELECT value FROM cpu`,
+			stmt:      `SELECT load FROM cpu # chunkSize 1`,
 			chunkSize: 1,
-			expected:  []string{`{"name":"cpu","values":[{"time":1000000000,"value":42}]}`, `{"name":"cpu","values":[{"time":2000000000,"value":60}]}`, `null`},
+			expected:  []string{`{"name":"cpu","fields":["load"],"values":[{"time":1000000000,"value":42,"tags":{"host":"serverA","region":"us-east"}}]}`},
 		},
 		{
-			stmt:      `SELECT value FROM cpu`,
+			stmt:      `SELECT load FROM cpu # chunkSize 2`,
 			chunkSize: 2,
-			expected:  []string{`{"name":"cpu","values":[{"time":1000000000,"value":42},{"time":2000000000,"value":60}]}`},
+			expected:  []string{`{"name":"cpu","fields":["load"],"values":[{"time":1000000000,"value":42,"tags":{"host":"serverA","region":"us-east"}},{"time":2000000000,"value":60,"tags":{"host":"serverB","region":"us-east"}}]}`},
 		},
 		{
-			stmt:      `SELECT value FROM cpu`,
+			stmt:      `SELECT load FROM cpu # chunkSize 3`,
 			chunkSize: 3,
-			expected:  []string{`{"name":"cpu","values":[{"time":1000000000,"value":42},{"time":2000000000,"value":60}]}`},
+			expected:  []string{`{"name":"cpu","fields":["load"],"values":[{"time":1000000000,"value":42,"tags":{"host":"serverA","region":"us-east"}},{"time":2000000000,"value":60,"tags":{"host":"serverB","region":"us-east"}}]}`},
 		},
 		{
-			stmt:     `SELECT value FROM cpu GROUP BY host`,
-			expected: []string{`{"name":"cpu","tags":{"host":"serverA"},"values":[{"time":1000000000,"value":42}]}`, `{"name":"cpu","tags":{"host":"serverB"},"values":[{"time":2000000000,"value":60}]}`, `null`},
+			stmt: `SELECT load FROM cpu GROUP BY host`,
+			expected: []string{
+				`{"name":"cpu","tags":{"host":"serverA"},"fields":["load"],"values":[{"time":1000000000,"value":42,"tags":{"host":"serverA","region":"us-east"}}]}`,
+				`{"name":"cpu","tags":{"host":"serverB"},"fields":["load"],"values":[{"time":2000000000,"value":60,"tags":{"host":"serverB","region":"us-east"}}]}`,
+			},
 		},
 		{
-			stmt:     `SELECT value FROM cpu GROUP BY region`,
-			expected: []string{`{"name":"cpu","tags":{"region":"us-east"},"values":[{"time":1000000000,"value":42},{"time":2000000000,"value":60}]}`, `null`},
+			stmt:     `SELECT load FROM cpu GROUP BY region`,
+			expected: []string{`{"name":"cpu","tags":{"region":"us-east"},"fields":["load"],"values":[{"time":1000000000,"value":42,"tags":{"host":"serverA","region":"us-east"}},{"time":2000000000,"value":60,"tags":{"host":"serverB","region":"us-east"}}]}`},
 		},
 		{
-			stmt:     `SELECT value FROM cpu WHERE host='serverA'`,
-			expected: []string{`{"name":"cpu","values":[{"time":1000000000,"value":42}]}`, `null`},
+			stmt:     `SELECT load FROM cpu WHERE host='serverA'`,
+			expected: []string{`{"name":"cpu","fields":["load"],"values":[{"time":1000000000,"value":42,"tags":{"host":"serverA","region":"us-east"}}]}`},
 		},
 		{
-			stmt:     `SELECT value FROM cpu WHERE host='serverB'`,
-			expected: []string{`{"name":"cpu","values":[{"time":2000000000,"value":60}]}`, `null`},
+			stmt:     `SELECT load FROM cpu WHERE host='serverB'`,
+			expected: []string{`{"name":"cpu","fields":["load"],"values":[{"time":2000000000,"value":60,"tags":{"host":"serverB","region":"us-east"}}]}`},
 		},
 		{
-			stmt:     `SELECT value FROM cpu WHERE host='serverC'`,
+			stmt:     `SELECT load FROM cpu WHERE host='serverC'`,
 			expected: []string{`null`},
 		},
 		{
-			stmt:     `SELECT value FROM cpu WHERE value = 60`,
-			expected: []string{`{"name":"cpu","values":[{"time":2000000000,"value":60}]}`, `null`},
+			stmt:     `SELECT load FROM cpu WHERE load = 60`,
+			expected: []string{`{"name":"cpu","fields":["load"],"values":[{"time":2000000000,"value":60,"tags":{"host":"serverB","region":"us-east"}}]}`},
 		},
 		{
-			stmt:     `SELECT value FROM cpu WHERE value != 60`,
-			expected: []string{`{"name":"cpu","values":[{"time":1000000000,"value":42}]}`, `null`},
+			stmt:     `SELECT load FROM cpu WHERE load != 60`,
+			expected: []string{`{"name":"cpu","fields":["load"],"values":[{"time":1000000000,"value":42,"tags":{"host":"serverA","region":"us-east"}}]}`},
 		},
 		{
-			stmt:     fmt.Sprintf(`SELECT value FROM cpu WHERE time = '%s'`, pt1time.Format(influxql.DateTimeFormat)),
-			expected: []string{`{"name":"cpu","values":[{"time":1000000000,"value":42}]}`, `null`},
+			stmt:     fmt.Sprintf(`SELECT load FROM cpu WHERE time = '%s'`, pt1time.Format(influxql.DateTimeFormat)),
+			expected: []string{`{"name":"cpu","fields":["load"],"values":[{"time":1000000000,"value":42,"tags":{"host":"serverA","region":"us-east"}}]}`},
 		},
 		{
-			stmt:     fmt.Sprintf(`SELECT value FROM cpu WHERE time > '%s'`, pt1time.Format(influxql.DateTimeFormat)),
-			expected: []string{`{"name":"cpu","values":[{"time":2000000000,"value":60}]}`, `null`},
+			stmt:     fmt.Sprintf(`SELECT load FROM cpu WHERE time > '%s'`, pt1time.Format(influxql.DateTimeFormat)),
+			expected: []string{`{"name":"cpu","fields":["load"],"values":[{"time":2000000000,"value":60,"tags":{"host":"serverB","region":"us-east"}}]}`},
 		},
 		{
-			stmt:     fmt.Sprintf(`SELECT value FROM cpu WHERE time > '%s'`, pt2time.Format(influxql.DateTimeFormat)),
+			stmt:     fmt.Sprintf(`SELECT load FROM cpu WHERE time > '%s'`, pt2time.Format(influxql.DateTimeFormat)),
 			expected: []string{`null`},
 		},
 	}
@ -188,20 +218,20 @@ func TestShardMapper_WriteAndSingleMapperRawQueryMultiValue(t *testing.T) {
 	shard := mustCreateShard(tmpDir)

 	pt1time := time.Unix(1, 0).UTC()
-	pt1 := NewPoint(
+	pt1 := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "serverA", "region": "us-east"},
 		map[string]interface{}{"foo": 42, "bar": 43},
 		pt1time,
 	)
 	pt2time := time.Unix(2, 0).UTC()
-	pt2 := NewPoint(
+	pt2 := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "serverB", "region": "us-east"},
 		map[string]interface{}{"foo": 60, "bar": 61},
 		pt2time,
 	)
-	err := shard.WritePoints([]Point{pt1, pt2})
+	err := shard.WritePoints([]tsdb.Point{pt1, pt2})
 	if err != nil {
 		t.Fatalf(err.Error())
 	}
@ -213,11 +243,11 @@ func TestShardMapper_WriteAndSingleMapperRawQueryMultiValue(t *testing.T) {
 	}{
 		{
 			stmt:     `SELECT foo FROM cpu`,
-			expected: []string{`{"name":"cpu","values":[{"time":1000000000,"value":42},{"time":2000000000,"value":60}]}`, `null`},
+			expected: []string{`{"name":"cpu","fields":["foo"],"values":[{"time":1000000000,"value":42,"tags":{"host":"serverA","region":"us-east"}},{"time":2000000000,"value":60,"tags":{"host":"serverB","region":"us-east"}}]}`},
 		},
 		{
 			stmt:     `SELECT foo,bar FROM cpu`,
-			expected: []string{`{"name":"cpu","values":[{"time":1000000000,"value":{"bar":43,"foo":42}},{"time":2000000000,"value":{"bar":61,"foo":60}}]}`, `null`},
+			expected: []string{`{"name":"cpu","fields":["bar","foo"],"values":[{"time":1000000000,"value":{"bar":43,"foo":42},"tags":{"host":"serverA","region":"us-east"}},{"time":2000000000,"value":{"bar":61,"foo":60},"tags":{"host":"serverB","region":"us-east"}}]}`},
 		},
 	}

@ -225,10 +255,75 @@ func TestShardMapper_WriteAndSingleMapperRawQueryMultiValue(t *testing.T) {
 		stmt := mustParseSelectStatement(tt.stmt)
 		mapper := openRawMapperOrFail(t, shard, stmt, tt.chunkSize)

-		for _, s := range tt.expected {
+		for i, s := range tt.expected {
 			got := nextRawChunkAsJson(t, mapper)
 			if got != s {
-				t.Errorf("test '%s'\n\tgot      %s\n\texpected %s", tt.stmt, got, tt.expected)
+				t.Errorf("test '%s'\n\tgot      %s\n\texpected %s", tt.stmt, got, tt.expected[i])
+				break
+			}
+		}
+	}
+}
+
+func TestShardMapper_WriteAndSingleMapperRawQueryMultiSource(t *testing.T) {
+	tmpDir, _ := ioutil.TempDir("", "shard_test")
+	defer os.RemoveAll(tmpDir)
+	shard := mustCreateShard(tmpDir)
+
+	pt1time := time.Unix(1, 0).UTC()
+	pt1 := tsdb.NewPoint(
+		"cpu0",
+		map[string]string{"host": "serverA", "region": "us-east"},
+		map[string]interface{}{"foo": 42},
+		pt1time,
+	)
+	pt2time := time.Unix(2, 0).UTC()
+	pt2 := tsdb.NewPoint(
+		"cpu1",
+		map[string]string{"host": "serverB", "region": "us-east"},
+		map[string]interface{}{"bar": 60},
+		pt2time,
+	)
+	err := shard.WritePoints([]tsdb.Point{pt1, pt2})
+	if err != nil {
+		t.Fatalf(err.Error())
+	}
+
+	var tests = []struct {
+		stmt      string
+		chunkSize int
+		expected  []string
+	}{
+		{
+			stmt:     `SELECT foo FROM cpu0,cpu1`,
+			expected: []string{`{"name":"cpu0","fields":["foo"],"values":[{"time":1000000000,"value":42,"tags":{"host":"serverA","region":"us-east"}}]}`},
+		},
+		{
+			stmt:     `SELECT foo FROM cpu0,cpu1 WHERE foo=42`,
+			expected: []string{`{"name":"cpu0","fields":["foo"],"values":[{"time":1000000000,"value":42,"tags":{"host":"serverA","region":"us-east"}}]}`},
+		},
+		{
+			stmt:     `SELECT bar FROM cpu0,cpu1`,
+			expected: []string{`{"name":"cpu1","fields":["bar"],"values":[{"time":2000000000,"value":60,"tags":{"host":"serverB","region":"us-east"}}]}`},
+		},
+		{
+			stmt:     `SELECT bar FROM cpu0,cpu1 WHERE foo=42`,
+			expected: []string{`null`},
+		},
+		{
+			stmt:     `SELECT bar FROM cpu0,cpu1 WHERE bar!=60`,
+			expected: []string{`null`},
+		},
+	}
+
+	for _, tt := range tests {
+		stmt := mustParseSelectStatement(tt.stmt)
+		mapper := openRawMapperOrFail(t, shard, stmt, tt.chunkSize)
+
+		for i, s := range tt.expected {
+			got := nextRawChunkAsJson(t, mapper)
+			if got != s {
+				t.Errorf("test '%s'\n\tgot      %s\n\texpected %s", tt.stmt, got, tt.expected[i])
 				break
 			}
 		}
@ -241,20 +336,20 @@ func TestShardMapper_WriteAndSingleMapperAggregateQuery(t *testing.T) {
 	shard := mustCreateShard(tmpDir)

 	pt1time := time.Unix(10, 0).UTC()
-	pt1 := NewPoint(
+	pt1 := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "serverA", "region": "us-east"},
 		map[string]interface{}{"value": 1},
 		pt1time,
 	)
 	pt2time := time.Unix(20, 0).UTC()
-	pt2 := NewPoint(
+	pt2 := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "serverB", "region": "us-east"},
 		map[string]interface{}{"value": 60},
 		pt2time,
 	)
-	err := shard.WritePoints([]Point{pt1, pt2})
+	err := shard.WritePoints([]tsdb.Point{pt1, pt2})
 	if err != nil {
 		t.Fatalf(err.Error())
 	}
@ -265,92 +360,92 @@ func TestShardMapper_WriteAndSingleMapperAggregateQuery(t *testing.T) {
 	}{
 		{
 			stmt:     `SELECT sum(value) FROM cpu`,
-			expected: []string{`{"name":"cpu","values":[{"value":[61]}]}`, `null`},
+			expected: []string{`{"name":"cpu","fields":["value"],"values":[{"value":[61]}]}`, `null`},
 		},
 		{
 			stmt:     `SELECT sum(value),mean(value) FROM cpu`,
-			expected: []string{`{"name":"cpu","values":[{"value":[61,{"Count":2,"Mean":30.5,"ResultType":1}]}]}`, `null`},
+			expected: []string{`{"name":"cpu","fields":["value"],"values":[{"value":[61,{"Count":2,"Mean":30.5,"ResultType":1}]}]}`, `null`},
 		},
 		{
 			stmt: `SELECT sum(value) FROM cpu GROUP BY host`,
 			expected: []string{
-				`{"name":"cpu","tags":{"host":"serverA"},"values":[{"value":[1]}]}`,
-				`{"name":"cpu","tags":{"host":"serverB"},"values":[{"value":[60]}]}`,
+				`{"name":"cpu","tags":{"host":"serverA"},"fields":["value"],"values":[{"value":[1]}]}`,
+				`{"name":"cpu","tags":{"host":"serverB"},"fields":["value"],"values":[{"value":[60]}]}`,
 				`null`},
 		},
 		{
 			stmt: `SELECT sum(value) FROM cpu GROUP BY region`,
 			expected: []string{
-				`{"name":"cpu","tags":{"region":"us-east"},"values":[{"value":[61]}]}`,
+				`{"name":"cpu","tags":{"region":"us-east"},"fields":["value"],"values":[{"value":[61]}]}`,
 				`null`},
 		},
 		{
 			stmt: `SELECT sum(value) FROM cpu GROUP BY region,host`,
 			expected: []string{
-				`{"name":"cpu","tags":{"host":"serverA","region":"us-east"},"values":[{"value":[1]}]}`,
-				`{"name":"cpu","tags":{"host":"serverB","region":"us-east"},"values":[{"value":[60]}]}`,
+				`{"name":"cpu","tags":{"host":"serverA","region":"us-east"},"fields":["value"],"values":[{"value":[1]}]}`,
+				`{"name":"cpu","tags":{"host":"serverB","region":"us-east"},"fields":["value"],"values":[{"value":[60]}]}`,
 				`null`},
 		},
 		{
 			stmt: `SELECT sum(value) FROM cpu WHERE host='serverB'`,
 			expected: []string{
-				`{"name":"cpu","values":[{"value":[60]}]}`,
+				`{"name":"cpu","fields":["value"],"values":[{"value":[60]}]}`,
 				`null`},
 		},
 		{
 			stmt: fmt.Sprintf(`SELECT sum(value) FROM cpu WHERE time = '%s'`, pt1time.Format(influxql.DateTimeFormat)),
 			expected: []string{
-				`{"name":"cpu","values":[{"time":10000000000,"value":[1]}]}`,
+				`{"name":"cpu","fields":["value"],"values":[{"time":10000000000,"value":[1]}]}`,
 				`null`},
 		},
 		{
 			stmt: fmt.Sprintf(`SELECT sum(value) FROM cpu WHERE time > '%s'`, pt1time.Format(influxql.DateTimeFormat)),
 			expected: []string{
-				`{"name":"cpu","values":[{"time":10000000001,"value":[60]}]}`,
+				`{"name":"cpu","fields":["value"],"values":[{"time":10000000001,"value":[60]}]}`,
 				`null`},
 		},
 		{
 			stmt: fmt.Sprintf(`SELECT sum(value) FROM cpu WHERE time > '%s'`, pt2time.Format(influxql.DateTimeFormat)),
 			expected: []string{
-				`{"name":"cpu","values":[{"time":20000000001,"value":[null]}]}`,
+				`{"name":"cpu","fields":["value"],"values":[{"time":20000000001,"value":[null]}]}`,
 				`null`},
 		},
 	}

 	for _, tt := range tests {
 		stmt := mustParseSelectStatement(tt.stmt)
-		mapper := openAggMapperOrFail(t, shard, stmt)
+		mapper := openLocalMapperOrFail(t, shard, stmt)

 		for i := range tt.expected {
 			got := aggIntervalAsJson(t, mapper)
 			if got != tt.expected[i] {
-				t.Errorf("test '%s'\n\tgot      %s\n\texpected %s", tt.stmt, got, tt.expected[i])
+				t.Fatalf("test '%s'\n\tgot      %s\n\texpected %s", tt.stmt, got, tt.expected[i])
 				break
 			}
 		}
 	}
 }

-func TestShardMapper_AggMapperTagSets(t *testing.T) {
+func TestShardMapper_LocalMapperTagSets(t *testing.T) {
 	tmpDir, _ := ioutil.TempDir("", "shard_test")
 	defer os.RemoveAll(tmpDir)
 	shard := mustCreateShard(tmpDir)

 	pt1time := time.Unix(1, 0).UTC()
-	pt1 := NewPoint(
+	pt1 := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "serverA", "region": "us-east"},
 		map[string]interface{}{"value": 42},
 		pt1time,
 	)
 	pt2time := time.Unix(2, 0).UTC()
-	pt2 := NewPoint(
+	pt2 := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "serverB", "region": "us-east"},
 		map[string]interface{}{"value": 60},
 		pt2time,
 	)
-	err := shard.WritePoints([]Point{pt1, pt2})
+	err := shard.WritePoints([]tsdb.Point{pt1, pt2})
 	if err != nil {
 		t.Fatalf(err.Error())
 	}
@ -387,7 +482,7 @@ func TestShardMapper_AggMapperTagSets(t *testing.T) {

 	for _, tt := range tests {
 		stmt := mustParseSelectStatement(tt.stmt)
-		mapper := openAggMapperOrFail(t, shard, stmt)
+		mapper := openLocalMapperOrFail(t, shard, stmt)
 		got := mapper.TagSets()
 		if !reflect.DeepEqual(got, tt.expected) {
 			t.Errorf("test '%s'\n\tgot      %s\n\texpected %s", tt.stmt, got, tt.expected)
@ -396,10 +491,10 @@ func TestShardMapper_AggMapperTagSets(t *testing.T) {

 }

-func mustCreateShard(dir string) *Shard {
+func mustCreateShard(dir string) *tsdb.Shard {
 	tmpShard := path.Join(dir, "shard")
-	index := NewDatabaseIndex()
-	sh := NewShard(index, tmpShard)
+	index := tsdb.NewDatabaseIndex()
+	sh := tsdb.NewShard(index, tmpShard, tsdb.NewEngineOptions())
 	if err := sh.Open(); err != nil {
 		panic(fmt.Sprintf("error opening shard: %s", err.Error()))
 	}
@ -415,8 +510,8 @@ func mustParseSelectStatement(s string) *influxql.SelectStatement {
 	return stmt.(*influxql.SelectStatement)
 }

-func openRawMapperOrFail(t *testing.T, shard *Shard, stmt *influxql.SelectStatement, chunkSize int) *RawMapper {
-	mapper := NewRawMapper(shard, stmt, chunkSize)
+func openRawMapperOrFail(t *testing.T, shard *tsdb.Shard, stmt *influxql.SelectStatement, chunkSize int) tsdb.Mapper {
+	mapper := tsdb.NewLocalMapper(shard, stmt, chunkSize)

 	if err := mapper.Open(); err != nil {
 		t.Fatalf("failed to open raw mapper: %s", err.Error())
@ -424,7 +519,7 @@ func openRawMapperOrFail(t *testing.T, shard *Shard, stmt *influxql.SelectStatem
 	return mapper
 }

-func nextRawChunkAsJson(t *testing.T, mapper *RawMapper) string {
+func nextRawChunkAsJson(t *testing.T, mapper tsdb.Mapper) string {
 	r, err := mapper.NextChunk()
 	if err != nil {
 		t.Fatalf("failed to get next chunk from mapper: %s", err.Error())
@ -436,8 +531,8 @@ func nextRawChunkAsJson(t *testing.T, mapper *RawMapper) string {
 	return string(b)
 }

-func openAggMapperOrFail(t *testing.T, shard *Shard, stmt *influxql.SelectStatement) *AggMapper {
-	mapper := NewAggMapper(shard, stmt)
+func openLocalMapperOrFail(t *testing.T, shard *tsdb.Shard, stmt *influxql.SelectStatement) *tsdb.LocalMapper {
+	mapper := tsdb.NewLocalMapper(shard, stmt, 0)

 	if err := mapper.Open(); err != nil {
 		t.Fatalf("failed to open aggregate mapper: %s", err.Error())
@ -445,7 +540,7 @@ func openAggMapperOrFail(t *testing.T, shard *Shard, stmt *influxql.SelectStatem
 	return mapper
 }

-func aggIntervalAsJson(t *testing.T, mapper *AggMapper) string {
+func aggIntervalAsJson(t *testing.T, mapper *tsdb.LocalMapper) string {
 	r, err := mapper.NextChunk()
 	if err != nil {
 		t.Fatalf("failed to get chunk from aggregate mapper: %s", err.Error())
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/meta.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/meta.go
@ -39,6 +39,27 @@ func NewDatabaseIndex() *DatabaseIndex {
 	}
 }

+// Names returns the index's list of measurement names, read while holding the
+// read lock. The slice is returned as stored (no copy is made) and this method
+// performs no sorting itself.
+func (d *DatabaseIndex) Names() []string {
+	d.mu.RLock()
+	list := d.names
+	d.mu.RUnlock()
+	return list
+}
+
+// Series looks up the series stored under key while holding the read lock.
+// A key that is not present in the index yields nil.
+func (d *DatabaseIndex) Series(key string) *Series {
+	d.mu.RLock()
+	found := d.series[key]
+	d.mu.RUnlock()
+	return found
+}
+
+// SeriesN reports how many series the index currently holds, counted while
+// holding the read lock.
+func (d *DatabaseIndex) SeriesN() int {
+	d.mu.RLock()
+	count := len(d.series)
+	d.mu.RUnlock()
+	return count
+}
+
 // Measurement returns the measurement object from the index by the name
 func (d *DatabaseIndex) Measurement(name string) *Measurement {
 	d.mu.RLock()
@ -55,8 +76,8 @@ func (d *DatabaseIndex) MeasurementSeriesCounts() (nMeasurements int, nSeries in
 	return
 }

-// createSeriesIndexIfNotExists adds the series for the given measurement to the index and sets its ID or returns the existing series object
-func (s *DatabaseIndex) createSeriesIndexIfNotExists(measurementName string, series *Series) *Series {
+// CreateSeriesIndexIfNotExists adds the series for the given measurement to the index and sets its ID or returns the existing series object
+func (s *DatabaseIndex) CreateSeriesIndexIfNotExists(measurementName string, series *Series) *Series {
 	// if there is a measurement for this id, it's already been added
 	ss := s.series[series.Key]
 	if ss != nil {
@ -64,7 +85,7 @@ func (s *DatabaseIndex) createSeriesIndexIfNotExists(measurementName string, ser
 	}

 	// get or create the measurement index
-	m := s.createMeasurementIndexIfNotExists(measurementName)
+	m := s.CreateMeasurementIndexIfNotExists(measurementName)

 	// set the in memory ID for query processing on this shard
 	series.id = s.lastID + 1
@ -78,8 +99,8 @@ func (s *DatabaseIndex) createSeriesIndexIfNotExists(measurementName string, ser
 	return series
 }

-// createMeasurementIndexIfNotExists creates or retrieves an in memory index object for the measurement
-func (s *DatabaseIndex) createMeasurementIndexIfNotExists(name string) *Measurement {
+// CreateMeasurementIndexIfNotExists creates or retrieves an in memory index object for the measurement
+func (s *DatabaseIndex) CreateMeasurementIndexIfNotExists(name string) *Measurement {
 	name = unescapeString(name)
 	m := s.measurements[name]
 	if m == nil {
@ -263,6 +284,7 @@ func (db *DatabaseIndex) DropSeries(keys []string) {
 			continue
 		}
 		series.measurement.DropSeries(series.id)
+		delete(db.series, k)
 	}
 }

@ -276,11 +298,10 @@ type Measurement struct {
 	index      *DatabaseIndex

 	// in-memory index fields
-	series              map[string]*Series // sorted tagset string to the series object
 	seriesByID          map[uint64]*Series // lookup table for series by their id
 	measurement         *Measurement
-	seriesByTagKeyValue map[string]map[string]seriesIDs // map from tag key to value to sorted set of series ids
-	seriesIDs           seriesIDs                       // sorted list of series IDs in this measurement
+	seriesByTagKeyValue map[string]map[string]SeriesIDs // map from tag key to value to sorted set of series ids
+	seriesIDs           SeriesIDs                       // sorted list of series IDs in this measurement
 }

 // NewMeasurement allocates and initializes a new Measurement.
@ -290,10 +311,9 @@ func NewMeasurement(name string, idx *DatabaseIndex) *Measurement {
 		fieldNames: make(map[string]struct{}),
 		index:      idx,

-		series:              make(map[string]*Series),
 		seriesByID:          make(map[uint64]*Series),
-		seriesByTagKeyValue: make(map[string]map[string]seriesIDs),
-		seriesIDs:           make(seriesIDs, 0),
+		seriesByTagKeyValue: make(map[string]map[string]SeriesIDs),
+		seriesIDs:           make(SeriesIDs, 0),
 	}
 }

@ -305,6 +325,13 @@ func (m *Measurement) HasField(name string) bool {
 	return hasField
 }

+// SeriesByID looks up the series registered under id in this measurement
+// while holding the read lock. An unknown id yields nil.
+func (m *Measurement) SeriesByID(id uint64) *Series {
+	m.mu.RLock()
+	found := m.seriesByID[id]
+	m.mu.RUnlock()
+	return found
+}
+
 // SeriesKeys returns the keys of every series in this measurement
 func (m *Measurement) SeriesKeys() []string {
 	m.mu.RLock()
@ -321,7 +348,7 @@ func (m *Measurement) ValidateGroupBy(stmt *influxql.SelectStatement) error {
 	for _, d := range stmt.Dimensions {
 		switch e := d.Expr.(type) {
 		case *influxql.VarRef:
-			if !m.HasTagKey(e.Val) {
+			if m.HasField(e.Val) {
 				return fmt.Errorf("can not use field in GROUP BY clause: %s", e.Val)
 			}
 		}
@ -353,8 +380,6 @@ func (m *Measurement) AddSeries(s *Series) bool {
 		return false
 	}
 	m.seriesByID[s.id] = s
-	tagset := string(marshalTags(s.Tags))
-	m.series[tagset] = s
 	m.seriesIDs = append(m.seriesIDs, s.id)

 	// the series ID should always be higher than all others because it's a new
@ -367,7 +392,7 @@ func (m *Measurement) AddSeries(s *Series) bool {
 	for k, v := range s.Tags {
 		valueMap := m.seriesByTagKeyValue[k]
 		if valueMap == nil {
-			valueMap = make(map[string]seriesIDs)
+			valueMap = make(map[string]SeriesIDs)
 			m.seriesByTagKeyValue[k] = valueMap
 		}
 		ids := valueMap[v]
@ -392,10 +417,6 @@ func (m *Measurement) DropSeries(seriesID uint64) {
 	if _, ok := m.seriesByID[seriesID]; !ok {
 		return
 	}
-	s := m.seriesByID[seriesID]
-	tagset := string(marshalTags(s.Tags))
-
-	delete(m.series, tagset)
 	delete(m.seriesByID, seriesID)

 	var ids []uint64
@ -407,7 +428,7 @@ func (m *Measurement) DropSeries(seriesID uint64) {
 	m.seriesIDs = ids

 	// remove this series id to the tag index on the measurement
-	// s.seriesByTagKeyValue is defined as map[string]map[string]seriesIDs
+	// s.seriesByTagKeyValue is defined as map[string]map[string]SeriesIDs
 	for k, v := range m.seriesByTagKeyValue {
 		values := v
 		for kk, vv := range values {
@ -497,7 +518,7 @@ func (m *Measurement) TagSets(stmt *influxql.SelectStatement, dimensions []strin

 		// Convert the TagSet to a string, so it can be added to a map allowing TagSets to be handled
 		// as a set.
-		tagsAsKey := string(marshalTags(tags))
+		tagsAsKey := string(MarshalTags(tags))
 		tagSet, ok := tagSets[tagsAsKey]
 		if !ok {
 			// This TagSet is new, create a new entry for it.
@ -507,7 +528,7 @@ func (m *Measurement) TagSets(stmt *influxql.SelectStatement, dimensions []strin
 				tagsForSet[k] = v
 			}
 			tagSet.Tags = tagsForSet
-			tagSet.Key = marshalTags(tagsForSet)
+			tagSet.Key = MarshalTags(tagsForSet)
 		}

 		// Associate the series and filter with the Tagset.
@ -534,11 +555,11 @@ func (m *Measurement) TagSets(stmt *influxql.SelectStatement, dimensions []strin
 }

 // mergeSeriesFilters merges two sets of filter expressions and culls series IDs.
-func mergeSeriesFilters(op influxql.Token, ids seriesIDs, lfilters, rfilters map[uint64]influxql.Expr) (seriesIDs, map[uint64]influxql.Expr) {
+func mergeSeriesFilters(op influxql.Token, ids SeriesIDs, lfilters, rfilters map[uint64]influxql.Expr) (SeriesIDs, map[uint64]influxql.Expr) {
 	// Create a map to hold the final set of series filter expressions.
 	filters := make(map[uint64]influxql.Expr, 0)
 	// Resulting list of series IDs
-	var series seriesIDs
+	var series SeriesIDs

 	// Combining logic:
 	// +==========+==========+==========+=======================+=======================+
@ -603,7 +624,7 @@ func mergeSeriesFilters(op influxql.Token, ids seriesIDs, lfilters, rfilters map

 // idsForExpr will return a collection of series ids and a filter expression that should
 // be used to filter points from those series.
-func (m *Measurement) idsForExpr(n *influxql.BinaryExpr) (seriesIDs, influxql.Expr, error) {
+func (m *Measurement) idsForExpr(n *influxql.BinaryExpr) (SeriesIDs, influxql.Expr, error) {
 	name, ok := n.LHS.(*influxql.VarRef)
 	value := n.RHS
 	if !ok {
@ -632,20 +653,20 @@ func (m *Measurement) idsForExpr(n *influxql.BinaryExpr) (seriesIDs, influxql.Ex

 	// if we're looking for series with a specific tag value
 	if str, ok := value.(*influxql.StringLiteral); ok {
-		var ids seriesIDs
+		var ids SeriesIDs

 		if n.Op == influxql.EQ {
 			// return series that have a tag of specific value.
 			ids = tagVals[str.Val]
 		} else if n.Op == influxql.NEQ {
-			ids = m.seriesIDs.reject(tagVals[str.Val])
+			ids = m.seriesIDs.Reject(tagVals[str.Val])
 		}
 		return ids, &influxql.BooleanLiteral{Val: true}, nil
 	}

 	// if we're looking for series with a tag value that matches a regex
 	if re, ok := value.(*influxql.RegexLiteral); ok {
-		var ids seriesIDs
+		var ids SeriesIDs

 		// The operation is a NEQREGEX, code must start by assuming all match, even
 		// series without any tags.
@ -657,9 +678,9 @@ func (m *Measurement) idsForExpr(n *influxql.BinaryExpr) (seriesIDs, influxql.Ex
 			match := re.Val.MatchString(k)

 			if match && n.Op == influxql.EQREGEX {
-				ids = ids.union(tagVals[k])
+				ids = ids.Union(tagVals[k])
 			} else if match && n.Op == influxql.NEQREGEX {
-				ids = ids.reject(tagVals[k])
+				ids = ids.Reject(tagVals[k])
 			}
 		}
 		return ids, &influxql.BooleanLiteral{Val: true}, nil
@ -671,7 +692,7 @@ func (m *Measurement) idsForExpr(n *influxql.BinaryExpr) (seriesIDs, influxql.Ex
 // walkWhereForSeriesIds recursively walks the WHERE clause and returns an ordered set of series IDs and
 // a map from those series IDs to filter expressions that should be used to limit points returned in
 // the final query result.
-func (m *Measurement) walkWhereForSeriesIds(expr influxql.Expr) (seriesIDs, map[uint64]influxql.Expr, error) {
+func (m *Measurement) walkWhereForSeriesIds(expr influxql.Expr) (SeriesIDs, map[uint64]influxql.Expr, error) {
 	switch n := expr.(type) {
 	case *influxql.BinaryExpr:
 		switch n.Op {
@ -702,12 +723,12 @@ func (m *Measurement) walkWhereForSeriesIds(expr influxql.Expr) (seriesIDs, map[
 			}

 			// Combine the series IDs from the LHS and RHS.
-			var ids seriesIDs
+			var ids SeriesIDs
 			switch n.Op {
 			case influxql.AND:
-				ids = lids.intersect(rids)
+				ids = lids.Intersect(rids)
 			case influxql.OR:
-				ids = lids.union(rids)
+				ids = lids.Union(rids)
 			}

 			// Merge the filter expressions for the LHS and RHS.
@ -785,7 +806,7 @@ func expandExprWithValues(expr influxql.Expr, keys []string, tagExprs []tagExpr,

 // seriesIDsAllOrByExpr walks an expression for matching series IDs
 // or, if no expression is given, returns all series IDs for the measurement.
-func (m *Measurement) seriesIDsAllOrByExpr(expr influxql.Expr) (seriesIDs, error) {
+func (m *Measurement) seriesIDsAllOrByExpr(expr influxql.Expr) (SeriesIDs, error) {
 	// If no expression given or the measurement has no series,
 	// we can just return the ids or nil accordingly.
 	if expr == nil {
@ -997,16 +1018,16 @@ func (s *Series) match(tags map[string]string) bool {
 	return true
 }

-// seriesIDs is a convenience type for sorting, checking equality, and doing
+// SeriesIDs is a convenience type for sorting, checking equality, and doing
 // union and intersection of collections of series ids.
-type seriesIDs []uint64
+type SeriesIDs []uint64

-func (a seriesIDs) Len() int           { return len(a) }
-func (a seriesIDs) Less(i, j int) bool { return a[i] < a[j] }
-func (a seriesIDs) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
+func (a SeriesIDs) Len() int           { return len(a) }
+func (a SeriesIDs) Less(i, j int) bool { return a[i] < a[j] }
+func (a SeriesIDs) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }

-// equals assumes that both are sorted.
-func (a seriesIDs) equals(other seriesIDs) bool {
+// Equals assumes that both are sorted.
+func (a SeriesIDs) Equals(other SeriesIDs) bool {
 	if len(a) != len(other) {
 		return false
 	}
@ -1018,9 +1039,9 @@ func (a seriesIDs) equals(other seriesIDs) bool {
 	return true
 }

-// intersect returns a new collection of series ids in sorted order that is the intersection of the two.
+// Intersect returns a new collection of series ids in sorted order that is the intersection of the two.
 // The two collections must already be sorted.
-func (a seriesIDs) intersect(other seriesIDs) seriesIDs {
+func (a SeriesIDs) Intersect(other SeriesIDs) SeriesIDs {
 	l := a
 	r := other

@ -1047,12 +1068,12 @@ func (a seriesIDs) intersect(other seriesIDs) seriesIDs {
 		}
 	}

-	return seriesIDs(ids)
+	return SeriesIDs(ids)
 }

-// union returns a new collection of series ids in sorted order that is the union of the two.
+// Union returns a new collection of series ids in sorted order that is the union of the two.
 // The two collections must already be sorted.
-func (a seriesIDs) union(other seriesIDs) seriesIDs {
+func (a SeriesIDs) Union(other SeriesIDs) SeriesIDs {
 	l := a
 	r := other
 	ids := make([]uint64, 0, len(l)+len(r))
@ -1081,9 +1102,9 @@ func (a seriesIDs) union(other seriesIDs) seriesIDs {
 	return ids
 }

-// reject returns a new collection of series ids in sorted order with the passed in set removed from the original.
+// Reject returns a new collection of series ids in sorted order with the passed in set removed from the original.
 // This is useful for the NOT operator. The two collections must already be sorted.
-func (a seriesIDs) reject(other seriesIDs) seriesIDs {
+func (a SeriesIDs) Reject(other SeriesIDs) SeriesIDs {
 	l := a
 	r := other
 	var i, j int
@ -1106,7 +1127,7 @@ func (a seriesIDs) reject(other seriesIDs) seriesIDs {
 		ids = append(ids, l[i:]...)
 	}

-	return seriesIDs(ids)
+	return SeriesIDs(ids)
 }

 // TagFilter represents a tag filter when looking up other tags or measurements.
@ -1118,7 +1139,7 @@ type TagFilter struct {
 }

 // used to convert the tag set to bytes for use as a lookup key
-func marshalTags(tags map[string]string) []byte {
+func MarshalTags(tags map[string]string) []byte {
 	// Empty maps marshal to empty bytes.
 	if len(tags) == 0 {
 		return nil
@ -1169,6 +1190,13 @@ func (m *Measurement) TagKeys() []string {
 	return keys
 }

+// SetFieldName adds the field name to the measurement.
+// The name is stored as a key of m.fieldNames while m.mu is held via Lock,
+// so setting a name that is already present leaves the set unchanged.
+func (m *Measurement) SetFieldName(name string) {
+	m.mu.Lock()
+	m.fieldNames[name] = struct{}{}
+	m.mu.Unlock()
+}
+
 // FieldNames returns a list of the measurement's field names
 func (m *Measurement) FieldNames() (a []string) {
 	m.mu.RLock()
@ -1180,7 +1208,7 @@ func (m *Measurement) FieldNames() (a []string) {
 	return
 }

-func (m *Measurement) tagValuesByKeyAndSeriesID(tagKeys []string, ids seriesIDs) map[string]stringSet {
+func (m *Measurement) tagValuesByKeyAndSeriesID(tagKeys []string, ids SeriesIDs) map[string]stringSet {
 	// If no tag keys were passed, get all tag keys for the measurement.
 	if len(tagKeys) == 0 {
 		for k := range m.seriesByTagKeyValue {
@ -1221,9 +1249,11 @@ func newStringSet() stringSet {
 	return make(map[string]struct{})
 }

-// add adds a string to the set.
-func (s stringSet) add(ss string) {
-	s[ss] = struct{}{}
+// add adds strings to the set.
+func (s stringSet) add(ss ...string) {
+	for _, n := range ss {
+		s[n] = struct{}{}
+	}
 }

 // contains returns whether the set contains the given string.
@ -1270,7 +1300,7 @@ func (s stringSet) intersect(o stringSet) stringSet {
 	return ns
 }

-func measurementFromSeriesKey(key string) string {
+func MeasurementFromSeriesKey(key string) string {
 	idx := strings.Index(key, ",")
 	if idx == -1 {
 		return key
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/meta_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/meta_test.go
@ -1,4 +1,4 @@
-package tsdb
+package tsdb_test

 import (
 	"bytes"
@ -6,86 +6,87 @@ import (
 	"testing"

 	"github.com/influxdb/influxdb/influxql"
+	"github.com/influxdb/influxdb/tsdb"
 )

-// Test comparing seriesIDs for equality.
-func Test_seriesIDs_equals(t *testing.T) {
-	ids1 := seriesIDs{1, 2, 3}
-	ids2 := seriesIDs{1, 2, 3}
-	ids3 := seriesIDs{4, 5, 6}
+// Test comparing SeriesIDs for equality.
+func Test_SeriesIDs_Equals(t *testing.T) {
+	ids1 := tsdb.SeriesIDs{1, 2, 3}
+	ids2 := tsdb.SeriesIDs{1, 2, 3}
+	ids3 := tsdb.SeriesIDs{4, 5, 6}

-	if !ids1.equals(ids2) {
+	if !ids1.Equals(ids2) {
 		t.Fatal("expected ids1 == ids2")
-	} else if ids1.equals(ids3) {
+	} else if ids1.Equals(ids3) {
 		t.Fatal("expected ids1 != ids3")
 	}
 }

-// Test intersecting sets of seriesIDs.
-func Test_seriesIDs_intersect(t *testing.T) {
+// Test intersecting sets of SeriesIDs.
+func Test_SeriesIDs_Intersect(t *testing.T) {
 	// Test swapping l & r, all branches of if-else, and exit loop when 'j < len(r)'
-	ids1 := seriesIDs{1, 3, 4, 5, 6}
-	ids2 := seriesIDs{1, 2, 3, 7}
-	exp := seriesIDs{1, 3}
-	got := ids1.intersect(ids2)
+	ids1 := tsdb.SeriesIDs{1, 3, 4, 5, 6}
+	ids2 := tsdb.SeriesIDs{1, 2, 3, 7}
+	exp := tsdb.SeriesIDs{1, 3}
+	got := ids1.Intersect(ids2)

-	if !exp.equals(got) {
+	if !exp.Equals(got) {
 		t.Fatalf("exp=%v, got=%v", exp, got)
 	}

 	// Test exit for loop when 'i < len(l)'
-	ids1 = seriesIDs{1}
-	ids2 = seriesIDs{1, 2}
-	exp = seriesIDs{1}
-	got = ids1.intersect(ids2)
+	ids1 = tsdb.SeriesIDs{1}
+	ids2 = tsdb.SeriesIDs{1, 2}
+	exp = tsdb.SeriesIDs{1}
+	got = ids1.Intersect(ids2)

-	if !exp.equals(got) {
+	if !exp.Equals(got) {
 		t.Fatalf("exp=%v, got=%v", exp, got)
 	}
 }

-// Test union sets of seriesIDs.
-func Test_seriesIDs_union(t *testing.T) {
+// Test union sets of SeriesIDs.
+func Test_SeriesIDs_Union(t *testing.T) {
 	// Test all branches of if-else, exit loop because of 'j < len(r)', and append remainder from left.
-	ids1 := seriesIDs{1, 2, 3, 7}
-	ids2 := seriesIDs{1, 3, 4, 5, 6}
-	exp := seriesIDs{1, 2, 3, 4, 5, 6, 7}
-	got := ids1.union(ids2)
+	ids1 := tsdb.SeriesIDs{1, 2, 3, 7}
+	ids2 := tsdb.SeriesIDs{1, 3, 4, 5, 6}
+	exp := tsdb.SeriesIDs{1, 2, 3, 4, 5, 6, 7}
+	got := ids1.Union(ids2)

-	if !exp.equals(got) {
+	if !exp.Equals(got) {
 		t.Fatalf("exp=%v, got=%v", exp, got)
 	}

 	// Test exit because of 'i < len(l)' and append remainder from right.
-	ids1 = seriesIDs{1}
-	ids2 = seriesIDs{1, 2}
-	exp = seriesIDs{1, 2}
-	got = ids1.union(ids2)
+	ids1 = tsdb.SeriesIDs{1}
+	ids2 = tsdb.SeriesIDs{1, 2}
+	exp = tsdb.SeriesIDs{1, 2}
+	got = ids1.Union(ids2)

-	if !exp.equals(got) {
+	if !exp.Equals(got) {
 		t.Fatalf("exp=%v, got=%v", exp, got)
 	}
 }

-// Test removing one set of seriesIDs from another.
-func Test_seriesIDs_reject(t *testing.T) {
+// Test removing one set of SeriesIDs from another.
+func Test_SeriesIDs_Reject(t *testing.T) {
 	// Test all branches of if-else, exit loop because of 'j < len(r)', and append remainder from left.
-	ids1 := seriesIDs{1, 2, 3, 7}
-	ids2 := seriesIDs{1, 3, 4, 5, 6}
-	exp := seriesIDs{2, 7}
-	got := ids1.reject(ids2)
+	ids1 := tsdb.SeriesIDs{1, 2, 3, 7}
+	ids2 := tsdb.SeriesIDs{1, 3, 4, 5, 6}
+	exp := tsdb.SeriesIDs{2, 7}
+	got := ids1.Reject(ids2)

-	if !exp.equals(got) {
+	if !exp.Equals(got) {
 		t.Fatalf("exp=%v, got=%v", exp, got)
 	}

 	// Test exit because of 'i < len(l)'.
-	ids1 = seriesIDs{1}
-	ids2 = seriesIDs{1, 2}
-	exp = seriesIDs{}
-	got = ids1.reject(ids2)
+	ids1 = tsdb.SeriesIDs{1}
+	ids2 = tsdb.SeriesIDs{1, 2}
+	exp = tsdb.SeriesIDs{}
+	got = ids1.Reject(ids2)

-	if !exp.equals(got) {
+	if !exp.Equals(got) {
 		t.Fatalf("exp=%v, got=%v", exp, got)
 	}
 }
@ -113,7 +114,7 @@ func TestMarshalTags(t *testing.T) {
 			result: []byte(`baz|foo|battttt|bar`),
 		},
 	} {
-		result := marshalTags(tt.tags)
+		result := tsdb.MarshalTags(tt.tags)
 		if !bytes.Equal(result, tt.result) {
 			t.Fatalf("%d. unexpected result: exp=%s, got=%s", i, tt.result, result)
 		}
@ -137,7 +138,7 @@ func benchmarkMarshalTags(b *testing.B, keyN int) {
 	// Marshal map into byte slice.
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
-		marshalTags(tags)
+		tsdb.MarshalTags(tags)
 	}
 }

@ -154,23 +155,23 @@ func BenchmarkCreateSeriesIndex_1M(b *testing.B) {
 }

 func benchmarkCreateSeriesIndex(b *testing.B, series []*TestSeries) {
-	idxs := make([]*DatabaseIndex, 0, b.N)
+	idxs := make([]*tsdb.DatabaseIndex, 0, b.N)
 	for i := 0; i < b.N; i++ {
-		idxs = append(idxs, NewDatabaseIndex())
+		idxs = append(idxs, tsdb.NewDatabaseIndex())
 	}

 	b.ResetTimer()
 	for n := 0; n < b.N; n++ {
 		idx := idxs[n]
 		for _, s := range series {
-			idx.createSeriesIndexIfNotExists(s.Measurement, s.Series)
+			idx.CreateSeriesIndexIfNotExists(s.Measurement, s.Series)
 		}
 	}
 }

 type TestSeries struct {
 	Measurement string
-	Series      *Series
+	Series      *tsdb.Series
 }

 func genTestSeries(mCnt, tCnt, vCnt int) []*TestSeries {
@ -181,8 +182,8 @@ func genTestSeries(mCnt, tCnt, vCnt int) []*TestSeries {
 		for _, ts := range tagSets {
 			series = append(series, &TestSeries{
 				Measurement: m,
-				Series: &Series{
-					Key:  fmt.Sprintf("%s:%s", m, string(marshalTags(ts))),
+				Series: &tsdb.Series{
+					Key:  fmt.Sprintf("%s:%s", m, string(tsdb.MarshalTags(ts))),
 					Tags: ts,
 				},
 			})
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/points.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/points.go
@ -37,6 +37,13 @@ type Point interface {
 	String() string
 }

+// Points represents a sortable list of points by timestamp.
+type Points []Point
+
+// Len, Less and Swap report the list length, order two elements and exchange
+// two elements; Less places the point whose Time() is earlier first.
+func (a Points) Len() int           { return len(a) }
+func (a Points) Less(i, j int) bool { return a[i].Time().Before(a[j].Time()) }
+func (a Points) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
+
 // point is the default implementation of Point.
 type point struct {
 	time time.Time
@ -109,7 +116,7 @@ func ParsePointsWithPrecision(buf []byte, defaultTime time.Time, precision strin
 		block []byte
 	)
 	for {
-		pos, block = scanTo(buf, pos, '\n')
+		pos, block = scanLine(buf, pos)
 		pos += 1

 		if len(block) == 0 {
@ -117,7 +124,14 @@ func ParsePointsWithPrecision(buf []byte, defaultTime time.Time, precision strin
 		}

 		// lines which start with '#' are comments
-		if start := skipWhitespace(block, 0); block[start] == '#' {
+		start := skipWhitespace(block, 0)
+
+		// If line is all whitespace, just skip it
+		if start >= len(block) {
+			continue
+		}
+
+		if block[start] == '#' {
 			continue
 		}

@ -222,6 +236,10 @@ func scanKey(buf []byte, i int) (int, []byte, error) {
 		}

 		if buf[i] == '=' {
+			if i-1 < 0 || i-2 < 0 {
+				return i, buf[start:i], fmt.Errorf("missing tag name")
+			}
+
 			// Check for "cpu,=value" but allow "cpu,a\,=value"
 			if buf[i-1] == ',' && buf[i-2] != '\\' {
 				return i, buf[start:i], fmt.Errorf("missing tag name")
@ -254,6 +272,13 @@ func scanKey(buf []byte, i int) (int, []byte, error) {
 				return i, buf[start:i], fmt.Errorf("missing tag value")
 			}
 			i += 1
+
+			// grow our indices slice if we have too many tags
+			if commas >= len(indices) {
+				newIndics := make([]int, cap(indices)*2)
+				copy(newIndics, indices)
+				indices = newIndics
+			}
 			indices[commas] = i
 			commas += 1

@ -273,6 +298,14 @@ func scanKey(buf []byte, i int) (int, []byte, error) {
 			if equals > 0 && commas-1 != equals-1 {
 				return i, buf[start:i], fmt.Errorf("missing tag value")
 			}
+
+			// grow our indices slice if we have too many tags
+			if commas >= len(indices) {
+				newIndics := make([]int, cap(indices)*2)
+				copy(newIndics, indices)
+				indices = newIndics
+			}
+
 			indices[commas] = i + 1
 			break
 		}
@ -286,6 +319,12 @@ func scanKey(buf []byte, i int) (int, []byte, error) {
 		return i, buf[start:i], fmt.Errorf("invalid tag format")
 	}

+	// This check makes sure we actually received fields from the user. #3379
+	// This will catch invalid syntax such as: `cpu,host=serverA,region=us-west`
+	if i >= len(buf) {
+		return i, buf[start:i], fmt.Errorf("missing fields")
+	}
+
 	// Now we know where the key region is within buf, and the locations of tags, we
 	// need to determine if duplicate tags exist and if the tags are sorted.  This iterates
 	// 1/2 of the list comparing each end with each other, walking towards the center from
@ -408,21 +447,20 @@ func scanFields(buf []byte, i int) (int, []byte, error) {

 			if isNumeric(buf[i+1]) || buf[i+1] == '-' || buf[i+1] == 'N' || buf[i+1] == 'n' {
 				var err error
-				i, _, err = scanNumber(buf, i+1)
+				i, err = scanNumber(buf, i+1)
 				if err != nil {
 					return i, buf[start:i], err
-				} else {
-					continue
 				}
-				// If next byte is not a double-quote, the value must be a boolean
-			} else if buf[i+1] != '"' {
+				continue
+			}
+			// If next byte is not a double-quote, the value must be a boolean
+			if buf[i+1] != '"' {
 				var err error
 				i, _, err = scanBoolean(buf, i+1)
 				if err != nil {
 					return i, buf[start:i], err
-				} else {
-					continue
 				}
+				continue
 			}
 		}

@ -483,8 +521,9 @@ func isNumeric(b byte) bool {
 // scanNumber returns the end position within buf, start at i after
 // scanning over buf for an integer, or float.  It returns an
 // error if an invalid number is scanned.
-func scanNumber(buf []byte, i int) (int, []byte, error) {
+func scanNumber(buf []byte, i int) (int, error) {
 	start := i
+	var isInt bool

 	// Is negative number?
 	if i < len(buf) && buf[i] == '-' {
@ -506,13 +545,19 @@ func scanNumber(buf []byte, i int) (int, []byte, error) {
 			break
 		}

+		if buf[i] == 'i' && i > start && !isInt {
+			isInt = true
+			i += 1
+			continue
+		}
+
 		if buf[i] == '.' {
 			decimals += 1
 		}

 		// Can't have more than 1 decimal (e.g. 1.1.1 should fail)
 		if decimals > 1 {
-			return i, buf[start:i], fmt.Errorf("invalid number")
+			return i, fmt.Errorf("invalid number")
 		}

 		// `e` is valid for floats but not as the first char
@ -534,36 +579,44 @@ func scanNumber(buf []byte, i int) (int, []byte, error) {
 				i += 3
 				continue
 			}
-			return i, buf[start:i], fmt.Errorf("invalid number")
+			return i, fmt.Errorf("invalid number")
 		}

 		if !isNumeric(buf[i]) {
-			return i, buf[start:i], fmt.Errorf("invalid number")
+			return i, fmt.Errorf("invalid number")
 		}
 		i += 1
 	}
+	if isInt && (decimals > 0 || scientific) {
+		return i, fmt.Errorf("invalid number")
+	}

 	// It's more common that numbers will be within min/max range for their type but we need to prevent
 	// out of range numbers from being parsed successfully.  This uses some simple heuristics to decide
 	// if we should parse the number to the actual type.  It does not do it all the time because it incurs
 	// extra allocations and we end up converting the type again when writing points to disk.
-	if decimals == 0 {
+	if isInt {
+		// Make sure the last char is an 'i' for integers (e.g. 9i10 is not valid)
+		if buf[i-1] != 'i' {
+			return i, fmt.Errorf("invalid number")
+		}
 		// Parse the int to check bounds if the number of digits could be larger than the max range
-		if len(buf[start:i]) >= maxInt64Digits || len(buf[start:i]) >= minInt64Digits {
-			if _, err := strconv.ParseInt(string(buf[start:i]), 10, 64); err != nil {
-				return i, buf[start:i], fmt.Errorf("invalid integer")
+		// We subtract 1 from the index to remove the `i` from our tests
+		if len(buf[start:i-1]) >= maxInt64Digits || len(buf[start:i-1]) >= minInt64Digits {
+			if _, err := strconv.ParseInt(string(buf[start:i-1]), 10, 64); err != nil {
+				return i, fmt.Errorf("unable to parse integer %s: %s", buf[start:i-1], err)
 			}
 		}
 	} else {
 		// Parse the float to check bounds if it's scientific or the number of digits could be larger than the max range
 		if scientific || len(buf[start:i]) >= maxFloat64Digits || len(buf[start:i]) >= minFloat64Digits {
 			if _, err := strconv.ParseFloat(string(buf[start:i]), 10); err != nil {
-				return i, buf[start:i], fmt.Errorf("invalid float")
+				return i, fmt.Errorf("invalid float")
 			}
 		}
 	}

-	return i, buf[start:i], nil
+	return i, nil
 }

 // scanBoolean returns the end position within buf, start at i after
@ -633,10 +686,6 @@ func skipWhitespace(buf []byte, i int) int {
 			return i
 		}

-		if buf[i] == '\\' {
-			i += 2
-			continue
-		}
 		if buf[i] == ' ' || buf[i] == '\t' {
 			i += 1
 			continue
@ -646,6 +695,39 @@ func skipWhitespace(buf []byte, i int) int {
 	return i
 }

+// scanLine returns the end position in buf and the next line found within
+// buf, starting the scan at index i.
+//
+// A line ends at the first '\n' that is not inside a double-quoted section,
+// or at the end of buf. A double quote toggles the quoted state unless the
+// byte directly before it is a backslash; a backslash skips itself and the
+// byte that follows it. The returned position is the index of the
+// terminating '\n', or len(buf) when no terminator was found. If i is
+// already at or past the end of buf, i is returned unchanged together with
+// a nil line.
+func scanLine(buf []byte, i int) (int, []byte) {
+	// The caller advances one past the previous line's end, so i can be
+	// len(buf)+1 here; slicing from there would panic when cap == len.
+	if i >= len(buf) {
+		return i, nil
+	}
+
+	start := i
+	quoted := false
+	for i < len(buf) {
+		switch {
+		// If we see a double quote, make sure it is not escaped. A quote at
+		// index 0 has no preceding byte and therefore cannot be escaped.
+		case buf[i] == '"' && (i == 0 || buf[i-1] != '\\'):
+			quoted = !quoted
+			i += 1
+		case buf[i] == '\\':
+			// skip the backslash and the byte it escapes
+			i += 2
+		case buf[i] == '\n' && !quoted:
+			return i, buf[start:i]
+		default:
+			i += 1
+		}
+	}
+
+	// A trailing backslash moves i one past the end; clamp it so the slice
+	// never reaches beyond len(buf).
+	if i > len(buf) {
+		i = len(buf)
+	}
+
+	return i, buf[start:i]
+}
+
 // scanTo returns the end position in buf and the next consecutive block
 // of bytes, starting from i and ending with stop byte.  If there are leading
 // spaces or escaped chars, they are skipped.
@ -791,7 +873,7 @@ func unescapeQuoteString(in string) string {
 // NewPoint returns a new point with the given measurement name, tags, fields and timestamp
 func NewPoint(name string, tags Tags, fields Fields, time time.Time) Point {
 	return &point{
-		key:    makeKey([]byte(name), tags),
+		key:    MakeKey([]byte(name), tags),
 		time:   time,
 		fields: fields.MarshalBinary(),
 	}
@ -821,7 +903,7 @@ func (p *point) Name() string {

 // SetName updates the measurement name for the point
 func (p *point) SetName(name string) {
-	p.key = makeKey([]byte(name), p.Tags())
+	p.key = MakeKey([]byte(name), p.Tags())
 }

 // Time return the timestamp for the point
@ -863,20 +945,20 @@ func (p *point) Tags() Tags {
 	return tags
 }

-func makeKey(name []byte, tags Tags) []byte {
-	return append(escape(name), tags.hashKey()...)
+// MakeKey builds a point key by appending the hash key of tags to the
+// escaped name.
+func MakeKey(name []byte, tags Tags) []byte {
+	return append(escape(name), tags.HashKey()...)
 }

 // SetTags replaces the tags for the point
 func (p *point) SetTags(tags Tags) {
-	p.key = makeKey(p.name(), tags)
+	p.key = MakeKey(p.name(), tags)
 }

 // AddTag adds or replaces a tag value for a point
 func (p *point) AddTag(key, value string) {
 	tags := p.Tags()
 	tags[key] = value
-	p.key = makeKey(p.name(), tags)
+	p.key = MakeKey(p.name(), tags)
 }

 // Fields returns the fields for the point
@ -950,7 +1032,7 @@ func (p *point) UnixNano() int64 {

 type Tags map[string]string

-func (t Tags) hashKey() []byte {
+func (t Tags) HashKey() []byte {
 	// Empty maps marshal to empty bytes.
 	if len(t) == 0 {
 		return nil
@ -995,6 +1077,10 @@ func (t Tags) hashKey() []byte {
 type Fields map[string]interface{}

 func parseNumber(val []byte) (interface{}, error) {
+	if val[len(val)-1] == 'i' {
+		val = val[:len(val)-1]
+		return strconv.ParseInt(string(val), 10, 64)
+	}
 	for i := 0; i < len(val); i++ {
 		// If there is a decimal or an N (NaN), I (Inf), parse as float
 		if val[i] == '.' || val[i] == 'N' || val[i] == 'n' || val[i] == 'I' || val[i] == 'i' || val[i] == 'e' {
@ -1004,7 +1090,7 @@ func parseNumber(val []byte) (interface{}, error) {
 			return string(val), nil
 		}
 	}
-	return strconv.ParseInt(string(val), 10, 64)
+	return strconv.ParseFloat(string(val), 64)
 }

 func newFieldsFromBinary(buf []byte) Fields {
@ -1024,6 +1110,7 @@ func newFieldsFromBinary(buf []byte) Fields {
 		if len(name) == 0 {
 			continue
 		}
+		name = unescape(name)

 		i, valueBuf = scanFieldValue(buf, i+1)
 		if len(valueBuf) == 0 {
@ -1051,7 +1138,7 @@ func newFieldsFromBinary(buf []byte) Fields {
 				panic(fmt.Sprintf("unable to parse bool value '%v': %v\n", string(valueBuf), err))
 			}
 		}
-		fields[string(unescape(name))] = value
+		fields[string(name)] = value
 		i += 1
 	}
 	return fields
@ -1074,12 +1161,16 @@ func (p Fields) MarshalBinary() []byte {
 		switch t := v.(type) {
 		case int:
 			b = append(b, []byte(strconv.FormatInt(int64(t), 10))...)
+			b = append(b, 'i')
 		case int32:
 			b = append(b, []byte(strconv.FormatInt(int64(t), 10))...)
+			b = append(b, 'i')
 		case uint64:
 			b = append(b, []byte(strconv.FormatUint(t, 10))...)
+			b = append(b, 'i')
 		case int64:
 			b = append(b, []byte(strconv.FormatInt(t, 10))...)
+			b = append(b, 'i')
 		case float64:
 			// ensure there is a decimal in the encoded form

--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/points_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/points_test.go
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/query_executor.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/query_executor.go
@ -44,13 +44,13 @@ type QueryExecutor struct {
 	Logger *log.Logger

 	// the local data store
-	store *Store
+	Store *Store
 }

 // NewQueryExecutor returns an initialized QueryExecutor
 func NewQueryExecutor(store *Store) *QueryExecutor {
 	return &QueryExecutor{
-		store:  store,
+		Store:  store,
 		Logger: log.New(os.Stderr, "[query] ", log.LstdFlags),
 	}
 }
@ -199,7 +199,7 @@ func (q *QueryExecutor) ExecuteQuery(query *influxql.Query, database string, chu
 }

 // Plan creates an execution plan for the given SelectStatement and returns an Executor.
-func (q *QueryExecutor) plan(stmt *influxql.SelectStatement, chunkSize int) (Executor, error) {
+func (q *QueryExecutor) Plan(stmt *influxql.SelectStatement, chunkSize int) (*Executor, error) {
 	shards := map[uint64]meta.ShardInfo{} // Shards requiring mappers.

 	// Replace instances of "now()" with the current time, and check the resultant times.
@ -245,31 +245,14 @@ func (q *QueryExecutor) plan(stmt *influxql.SelectStatement, chunkSize int) (Exe
 		mappers = append(mappers, m)
 	}

-	var executor Executor
-	if len(mappers) > 0 {
-		// All Mapper are of same type, so check first to determine correct Executor type.
-		if _, ok := mappers[0].(*RawMapper); ok {
-			executor = NewRawExecutor(stmt, mappers, chunkSize)
-		} else {
-			executor = NewAggregateExecutor(stmt, mappers)
-		}
-	} else {
-		// With no mappers, the Executor type doesn't matter.
-		executor = NewRawExecutor(stmt, nil, chunkSize)
-	}
+	executor := NewExecutor(stmt, mappers, chunkSize)
 	return executor, nil
 }

 // executeSelectStatement plans and executes a select statement against a database.
 func (q *QueryExecutor) executeSelectStatement(statementID int, stmt *influxql.SelectStatement, results chan *influxql.Result, chunkSize int) error {
-	// Perform any necessary query re-writing.
-	stmt, err := q.rewriteSelectStatement(stmt)
-	if err != nil {
-		return err
-	}
-
 	// Plan statement execution.
-	e, err := q.plan(stmt, chunkSize)
+	e, err := q.Plan(stmt, chunkSize)
 	if err != nil {
 		return err
 	}
@ -282,10 +265,9 @@ func (q *QueryExecutor) executeSelectStatement(statementID int, stmt *influxql.S
 	for row := range ch {
 		if row.Err != nil {
 			return row.Err
-		} else {
-			resultSent = true
-			results <- &influxql.Result{StatementID: statementID, Series: []*influxql.Row{row}}
 		}
+		resultSent = true
+		results <- &influxql.Result{StatementID: statementID, Series: []*influxql.Row{row}}
 	}

 	if !resultSent {
@ -295,85 +277,6 @@ func (q *QueryExecutor) executeSelectStatement(statementID int, stmt *influxql.S
 	return nil
 }

-// rewriteSelectStatement performs any necessary query re-writing.
-func (q *QueryExecutor) rewriteSelectStatement(stmt *influxql.SelectStatement) (*influxql.SelectStatement, error) {
-	var err error
-
-	// Expand regex expressions in the FROM clause.
-	sources, err := q.expandSources(stmt.Sources)
-	if err != nil {
-		return nil, err
-	}
-	stmt.Sources = sources
-
-	// Expand wildcards in the fields or GROUP BY.
-	if stmt.HasWildcard() {
-		stmt, err = q.expandWildcards(stmt)
-		if err != nil {
-			return nil, err
-		}
-	}
-
-	stmt.RewriteDistinct()
-
-	return stmt, nil
-}
-
-// expandWildcards returns a new SelectStatement with wildcards in the fields
-// and/or GROUP BY expanded with actual field names.
-func (q *QueryExecutor) expandWildcards(stmt *influxql.SelectStatement) (*influxql.SelectStatement, error) {
-	// If there are no wildcards in the statement, return it as-is.
-	if !stmt.HasWildcard() {
-		return stmt, nil
-	}
-
-	// Use sets to avoid duplicate field names.
-	fieldSet := map[string]struct{}{}
-	dimensionSet := map[string]struct{}{}
-
-	var fields influxql.Fields
-	var dimensions influxql.Dimensions
-
-	// Iterate measurements in the FROM clause getting the fields & dimensions for each.
-	for _, src := range stmt.Sources {
-		if m, ok := src.(*influxql.Measurement); ok {
-			// Lookup the database. The database may not exist if no data for this database
-			// was ever written to the shard.
-			db := q.store.DatabaseIndex(m.Database)
-			if db == nil {
-				return stmt, nil
-			}
-
-			// Lookup the measurement in the database.
-			mm := db.measurements[m.Name]
-			if mm == nil {
-				return nil, ErrMeasurementNotFound(m.String())
-			}
-
-			// Get the fields for this measurement.
-			for _, name := range mm.FieldNames() {
-				if _, ok := fieldSet[name]; ok {
-					continue
-				}
-				fieldSet[name] = struct{}{}
-				fields = append(fields, &influxql.Field{Expr: &influxql.VarRef{Val: name}})
-			}
-
-			// Get the dimensions for this measurement.
-			for _, t := range mm.TagKeys() {
-				if _, ok := dimensionSet[t]; ok {
-					continue
-				}
-				dimensionSet[t] = struct{}{}
-				dimensions = append(dimensions, &influxql.Dimension{Expr: &influxql.VarRef{Val: t}})
-			}
-		}
-	}
-
-	// Return a new SelectStatement with the wild cards rewritten.
-	return stmt.RewriteWildcards(fields, dimensions), nil
-}
-
 // expandSources expands regex sources and removes duplicates.
 // NOTE: sources must be normalized (db and rp set) before calling this function.
 func (q *QueryExecutor) expandSources(sources influxql.Sources) (influxql.Sources, error) {
@ -394,7 +297,7 @@ func (q *QueryExecutor) expandSources(sources influxql.Sources) (influxql.Source
 			}

 			// Lookup the database.
-			db := q.store.DatabaseIndex(src.Database)
+			db := q.Store.DatabaseIndex(src.Database)
 			if db == nil {
 				return nil, nil
 			}
@ -453,7 +356,7 @@ func (q *QueryExecutor) executeDropDatabaseStatement(stmt *influxql.DropDatabase
 		}
 	}

-	err = q.store.DeleteDatabase(stmt.Name, shardIDs)
+	err = q.Store.DeleteDatabase(stmt.Name, shardIDs)
 	if err != nil {
 		return &influxql.Result{Err: err}
 	}
@ -464,7 +367,7 @@ func (q *QueryExecutor) executeDropDatabaseStatement(stmt *influxql.DropDatabase
 // executeDropMeasurementStatement removes the measurement and all series data from the local store for the given measurement
 func (q *QueryExecutor) executeDropMeasurementStatement(stmt *influxql.DropMeasurementStatement, database string) *influxql.Result {
 	// Find the database.
-	db := q.store.DatabaseIndex(database)
+	db := q.Store.DatabaseIndex(database)
 	if db == nil {
 		return &influxql.Result{}
 	}
@ -478,7 +381,7 @@ func (q *QueryExecutor) executeDropMeasurementStatement(stmt *influxql.DropMeasu
 	db.DropMeasurement(m.Name)

 	// now drop the raw data
-	if err := q.store.deleteMeasurement(m.Name, m.SeriesKeys()); err != nil {
+	if err := q.Store.deleteMeasurement(m.Name, m.SeriesKeys()); err != nil {
 		return &influxql.Result{Err: err}
 	}

@ -488,7 +391,7 @@ func (q *QueryExecutor) executeDropMeasurementStatement(stmt *influxql.DropMeasu
 // executeDropSeriesStatement removes all series from the local store that match the drop query
 func (q *QueryExecutor) executeDropSeriesStatement(stmt *influxql.DropSeriesStatement, database string) *influxql.Result {
 	// Find the database.
-	db := q.store.DatabaseIndex(database)
+	db := q.Store.DatabaseIndex(database)
 	if db == nil {
 		return &influxql.Result{}
 	}
@ -506,7 +409,7 @@ func (q *QueryExecutor) executeDropSeriesStatement(stmt *influxql.DropSeriesStat

 	var seriesKeys []string
 	for _, m := range measurements {
-		var ids seriesIDs
+		var ids SeriesIDs
 		if stmt.Condition != nil {
 			// Get series IDs that match the WHERE clause.
 			ids, _, err = m.walkWhereForSeriesIds(stmt.Condition)
@ -524,7 +427,7 @@ func (q *QueryExecutor) executeDropSeriesStatement(stmt *influxql.DropSeriesStat
 	}

 	// delete the raw series data
-	if err := q.store.deleteSeries(seriesKeys); err != nil {
+	if err := q.Store.deleteSeries(seriesKeys); err != nil {
 		return &influxql.Result{Err: err}
 	}
 	// remove them from the index
@ -535,7 +438,7 @@ func (q *QueryExecutor) executeDropSeriesStatement(stmt *influxql.DropSeriesStat

 func (q *QueryExecutor) executeShowSeriesStatement(stmt *influxql.ShowSeriesStatement, database string) *influxql.Result {
 	// Find the database.
-	db := q.store.DatabaseIndex(database)
+	db := q.Store.DatabaseIndex(database)
 	if db == nil {
 		return &influxql.Result{}
 	}
@ -559,7 +462,7 @@ func (q *QueryExecutor) executeShowSeriesStatement(stmt *influxql.ShowSeriesStat

 	// Loop through measurements to build result. One result row / measurement.
 	for _, m := range measurements {
-		var ids seriesIDs
+		var ids SeriesIDs

 		if stmt.Condition != nil {
 			// Get series IDs that match the WHERE clause.
@ -646,7 +549,7 @@ func (q *QueryExecutor) filterShowSeriesResult(limit, offset int, rows influxql.

 func (q *QueryExecutor) executeShowMeasurementsStatement(stmt *influxql.ShowMeasurementsStatement, database string) *influxql.Result {
 	// Find the database.
-	db := q.store.DatabaseIndex(database)
+	db := q.Store.DatabaseIndex(database)
 	if db == nil {
 		return &influxql.Result{}
 	}
@ -705,7 +608,7 @@ func (q *QueryExecutor) executeShowMeasurementsStatement(stmt *influxql.ShowMeas

 func (q *QueryExecutor) executeShowTagKeysStatement(stmt *influxql.ShowTagKeysStatement, database string) *influxql.Result {
 	// Find the database.
-	db := q.store.DatabaseIndex(database)
+	db := q.Store.DatabaseIndex(database)
 	if db == nil {
 		return &influxql.Result{}
 	}
@ -758,7 +661,7 @@ func (q *QueryExecutor) executeShowTagKeysStatement(stmt *influxql.ShowTagKeysSt

 func (q *QueryExecutor) executeShowTagValuesStatement(stmt *influxql.ShowTagValuesStatement, database string) *influxql.Result {
 	// Find the database.
-	db := q.store.DatabaseIndex(database)
+	db := q.Store.DatabaseIndex(database)
 	if db == nil {
 		return &influxql.Result{}
 	}
@ -782,7 +685,7 @@ func (q *QueryExecutor) executeShowTagValuesStatement(stmt *influxql.ShowTagValu

 	tagValues := make(map[string]stringSet)
 	for _, m := range measurements {
-		var ids seriesIDs
+		var ids SeriesIDs

 		if stmt.Condition != nil {
 			// Get series IDs that match the WHERE clause.
@ -836,7 +739,7 @@ func (q *QueryExecutor) executeShowFieldKeysStatement(stmt *influxql.ShowFieldKe
 	var err error

 	// Find the database.
-	db := q.store.DatabaseIndex(database)
+	db := q.Store.DatabaseIndex(database)
 	if db == nil {
 		return &influxql.Result{}
 	}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/query_executor_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/query_executor_test.go
@ -1,6 +1,7 @@
-package tsdb
+package tsdb_test

 import (
+	"encoding/json"
 	"io/ioutil"
 	"os"
 	"path/filepath"
@ -10,6 +11,7 @@ import (

 	"github.com/influxdb/influxdb/influxql"
 	"github.com/influxdb/influxdb/meta"
+	"github.com/influxdb/influxdb/tsdb"
 )

 var sgID = uint64(2)
@ -17,10 +19,10 @@ var shardID = uint64(1)

 func TestWritePointsAndExecuteQuery(t *testing.T) {
 	store, executor := testStoreAndExecutor()
-	defer os.RemoveAll(store.path)
+	defer os.RemoveAll(store.Path())

 	// Write first point.
-	if err := store.WriteToShard(shardID, []Point{NewPoint(
+	if err := store.WriteToShard(shardID, []tsdb.Point{tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "server"},
 		map[string]interface{}{"value": 1.0},
@ -30,7 +32,7 @@ func TestWritePointsAndExecuteQuery(t *testing.T) {
 	}

 	// Write second point.
-	if err := store.WriteToShard(shardID, []Point{NewPoint(
+	if err := store.WriteToShard(shardID, []tsdb.Point{tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "server"},
 		map[string]interface{}{"value": 1.0},
@ -39,100 +41,90 @@ func TestWritePointsAndExecuteQuery(t *testing.T) {
 		t.Fatalf(err.Error())
 	}

-	got := executeAndGetJSON("select * from cpu", executor)
-	exepected := `[{"series":[{"name":"cpu","tags":{"host":"server"},"columns":["time","value"],"values":[["1970-01-01T00:00:01.000000002Z",1],["1970-01-01T00:00:02.000000003Z",1]]}]}]`
+	got := executeAndGetJSON("SELECT * FROM cpu", executor)
+	exepected := `[{"series":[{"name":"cpu","columns":["time","host","value"],"values":[["1970-01-01T00:00:01.000000002Z","server",1],["1970-01-01T00:00:02.000000003Z","server",1]]}]}]`
 	if exepected != got {
-		t.Fatalf("exp: %s\ngot: %s", exepected, got)
+		t.Fatalf("\nexp: %s\ngot: %s", exepected, got)
+	}
+
+	got = executeAndGetJSON("SELECT * FROM cpu GROUP BY *", executor)
+	exepected = `[{"series":[{"name":"cpu","tags":{"host":"server"},"columns":["time","value"],"values":[["1970-01-01T00:00:01.000000002Z",1],["1970-01-01T00:00:02.000000003Z",1]]}]}]`
+	if exepected != got {
+		t.Fatalf("\nexp: %s\ngot: %s", exepected, got)
 	}

 	store.Close()
-	store = NewStore(store.path)
+	store = tsdb.NewStore(store.Path())
 	if err := store.Open(); err != nil {
 		t.Fatalf(err.Error())
 	}
-	executor.store = store
+	executor.Store = store
 	executor.ShardMapper = &testShardMapper{store: store}

-	got = executeAndGetJSON("select * from cpu", executor)
+	got = executeAndGetJSON("SELECT * FROM cpu GROUP BY *", executor)
 	if exepected != got {
-		t.Fatalf("exp: %s\ngot: %s", exepected, got)
+		t.Fatalf("\nexp: %s\ngot: %s", exepected, got)
 	}
 }

-// Ensure that points can be written and flushed even after a restart.
-func TestWritePointsAndExecuteQuery_FlushRestart(t *testing.T) {
+// Ensure writing a point and updating it results in only a single point.
+func TestWritePointsAndExecuteQuery_Update(t *testing.T) {
 	store, executor := testStoreAndExecutor()
-	defer os.RemoveAll(store.path)
+	defer os.RemoveAll(store.Path())

-	// Write first point.
-	if err := store.WriteToShard(shardID, []Point{NewPoint(
-		"cpu",
-		map[string]string{"host": "server"},
-		map[string]interface{}{"value": 1.0},
-		time.Unix(1, 2),
+	// Write original point.
+	if err := store.WriteToShard(1, []tsdb.Point{tsdb.NewPoint(
+		"temperature",
+		map[string]string{},
+		map[string]interface{}{"value": 100.0},
+		time.Unix(0, 0),
 	)}); err != nil {
 		t.Fatalf(err.Error())
 	}

-	// Write second point.
-	if err := store.WriteToShard(shardID, []Point{NewPoint(
-		"cpu",
-		map[string]string{"host": "server"},
-		map[string]interface{}{"value": 1.0},
-		time.Unix(2, 3),
-	)}); err != nil {
-		t.Fatalf(err.Error())
-	}
-
-	// Restart the store.
-	if err := store.Close(); err != nil {
-		t.Fatal(err)
-	} else if err = store.Open(); err != nil {
-		t.Fatal(err)
-	}
-
-	// Flush WAL data to the index.
-	if err := store.Flush(); err != nil {
-		t.Fatal(err)
-	}
-
-	got := executeAndGetJSON("select * from cpu", executor)
-	exepected := `[{"series":[{"name":"cpu","tags":{"host":"server"},"columns":["time","value"],"values":[["1970-01-01T00:00:01.000000002Z",1],["1970-01-01T00:00:02.000000003Z",1]]}]}]`
-	if exepected != got {
-		t.Fatalf("exp: %s\ngot: %s", exepected, got)
-	}
-
+	// Restart store.
 	store.Close()
-	store = NewStore(store.path)
+	store = tsdb.NewStore(store.Path())
 	if err := store.Open(); err != nil {
 		t.Fatalf(err.Error())
 	}
-	executor.store = store
+	executor.Store = store
 	executor.ShardMapper = &testShardMapper{store: store}

-	got = executeAndGetJSON("select * from cpu", executor)
-	if exepected != got {
-		t.Fatalf("exp: %s\ngot: %s", exepected, got)
+	// Rewrite point with new value.
+	if err := store.WriteToShard(1, []tsdb.Point{tsdb.NewPoint(
+		"temperature",
+		map[string]string{},
+		map[string]interface{}{"value": 200.0},
+		time.Unix(0, 0),
+	)}); err != nil {
+		t.Fatalf(err.Error())
+	}
+
+	got := executeAndGetJSON("select * from temperature", executor)
+	exp := `[{"series":[{"name":"temperature","columns":["time","value"],"values":[["1970-01-01T00:00:00Z",200]]}]}]`
+	if exp != got {
+		t.Fatalf("\n\nexp: %s\ngot: %s", exp, got)
 	}
 }

 func TestDropSeriesStatement(t *testing.T) {
 	store, executor := testStoreAndExecutor()
-	defer os.RemoveAll(store.path)
+	defer os.RemoveAll(store.Path())

-	pt := NewPoint(
+	pt := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "server"},
 		map[string]interface{}{"value": 1.0},
 		time.Unix(1, 2),
 	)

-	err := store.WriteToShard(shardID, []Point{pt})
+	err := store.WriteToShard(shardID, []tsdb.Point{pt})
 	if err != nil {
 		t.Fatalf(err.Error())
 	}

-	got := executeAndGetJSON("select * from cpu", executor)
+	got := executeAndGetJSON("SELECT * FROM cpu GROUP BY *", executor)
 	exepected := `[{"series":[{"name":"cpu","tags":{"host":"server"},"columns":["time","value"],"values":[["1970-01-01T00:00:01.000000002Z",1]]}]}]`
 	if exepected != got {
 		t.Fatalf("exp: %s\ngot: %s", exepected, got)
@ -140,7 +132,7 @@ func TestDropSeriesStatement(t *testing.T) {

 	got = executeAndGetJSON("drop series from cpu", executor)

-	got = executeAndGetJSON("select * from cpu", executor)
+	got = executeAndGetJSON("SELECT * FROM cpu GROUP BY *", executor)
 	exepected = `[{}]`
 	if exepected != got {
 		t.Fatalf("exp: %s\ngot: %s", exepected, got)
@ -153,9 +145,9 @@ func TestDropSeriesStatement(t *testing.T) {
 	}

 	store.Close()
-	store = NewStore(store.path)
+	store = tsdb.NewStore(store.Path())
 	store.Open()
-	executor.store = store
+	executor.Store = store

 	got = executeAndGetJSON("select * from cpu", executor)
 	exepected = `[{}]`
@ -172,22 +164,22 @@ func TestDropSeriesStatement(t *testing.T) {

 func TestDropMeasurementStatement(t *testing.T) {
 	store, executor := testStoreAndExecutor()
-	defer os.RemoveAll(store.path)
+	defer os.RemoveAll(store.Path())

-	pt := NewPoint(
+	pt := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "server"},
 		map[string]interface{}{"value": 1.0},
 		time.Unix(1, 2),
 	)
-	pt2 := NewPoint(
+	pt2 := tsdb.NewPoint(
 		"memory",
 		map[string]string{"host": "server"},
 		map[string]interface{}{"value": 1.0},
 		time.Unix(1, 2),
 	)

-	if err := store.WriteToShard(shardID, []Point{pt, pt2}); err != nil {
+	if err := store.WriteToShard(shardID, []tsdb.Point{pt, pt2}); err != nil {
 		t.Fatal(err)
 	}

@ -215,7 +207,7 @@ func TestDropMeasurementStatement(t *testing.T) {
 			t.Fatalf("exp: %s\ngot: %s", exepected, got)
 		}
 		got = executeAndGetJSON("select * from memory", executor)
-		exepected = `[{"error":"measurement not found: \"foo\".\"foo\".memory"}]`
+		exepected = `[{}]`
 		if exepected != got {
 			t.Fatalf("exp: %s\ngot: %s", exepected, got)
 		}
@ -223,9 +215,9 @@ func TestDropMeasurementStatement(t *testing.T) {

 	validateDrop()
 	store.Close()
-	store = NewStore(store.path)
+	store = tsdb.NewStore(store.Path())
 	store.Open()
-	executor.store = store
+	executor.Store = store
 	validateDrop()
 }

@ -240,20 +232,20 @@ func (m *metaExec) ExecuteStatement(stmt influxql.Statement) *influxql.Result {

 func TestDropDatabase(t *testing.T) {
 	store, executor := testStoreAndExecutor()
-	defer os.RemoveAll(store.path)
+	defer os.RemoveAll(store.Path())

-	pt := NewPoint(
+	pt := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "server"},
 		map[string]interface{}{"value": 1.0},
 		time.Unix(1, 2),
 	)

-	if err := store.WriteToShard(shardID, []Point{pt}); err != nil {
+	if err := store.WriteToShard(shardID, []tsdb.Point{pt}); err != nil {
 		t.Fatal(err)
 	}

-	got := executeAndGetJSON("select * from cpu", executor)
+	got := executeAndGetJSON("SELECT * FROM cpu GROUP BY *", executor)
 	expected := `[{"series":[{"name":"cpu","tags":{"host":"server"},"columns":["time","value"],"values":[["1970-01-01T00:00:01.000000002Z",1]]}]}]`
 	if expected != got {
 		t.Fatalf("exp: %s\ngot: %s", expected, got)
@ -267,7 +259,7 @@ func TestDropDatabase(t *testing.T) {
 	executor.MetaStatementExecutor = me

 	// verify the database is there on disk
-	dbPath := filepath.Join(store.path, "foo")
+	dbPath := filepath.Join(store.Path(), "foo")
 	if _, err := os.Stat(dbPath); err != nil {
 		t.Fatalf("execpted database dir %s to exist", dbPath)
 	}
@ -287,12 +279,12 @@ func TestDropDatabase(t *testing.T) {
 	}

 	store.Close()
-	store = NewStore(store.path)
+	store = tsdb.NewStore(store.Path())
 	store.Open()
-	executor.store = store
+	executor.Store = store
 	executor.ShardMapper = &testShardMapper{store: store}

-	if err := store.WriteToShard(shardID, []Point{pt}); err == nil || err.Error() != "shard not found" {
+	if err := store.WriteToShard(shardID, []tsdb.Point{pt}); err == nil || err.Error() != "shard not found" {
 		t.Fatalf("expected shard to not be found")
 	}
 }
@ -300,7 +292,7 @@ func TestDropDatabase(t *testing.T) {
 // Ensure that queries for which there is no data result in an empty set.
 func TestQueryNoData(t *testing.T) {
 	store, executor := testStoreAndExecutor()
-	defer os.RemoveAll(store.path)
+	defer os.RemoveAll(store.Path())

 	got := executeAndGetJSON("select * from /.*/", executor)
 	expected := `[{}]`
@ -321,7 +313,7 @@ func TestQueryNoData(t *testing.T) {
 // to create a user.
 func TestAuthenticateIfUserCountZeroAndCreateUser(t *testing.T) {
 	store, executor := testStoreAndExecutor()
-	defer os.RemoveAll(store.path)
+	defer os.RemoveAll(store.Path())
 	ms := &testMetastore{userCount: 0}
 	executor.MetaStore = ms

@ -348,10 +340,10 @@ func TestAuthenticateIfUserCountZeroAndCreateUser(t *testing.T) {
 	}
 }

-func testStoreAndExecutor() (*Store, *QueryExecutor) {
+func testStoreAndExecutor() (*tsdb.Store, *tsdb.QueryExecutor) {
 	path, _ := ioutil.TempDir("", "")

-	store := NewStore(path)
+	store := tsdb.NewStore(path)
 	err := store.Open()
 	if err != nil {
 		panic(err)
@ -361,14 +353,14 @@ func testStoreAndExecutor() (*Store, *QueryExecutor) {
 	shardID := uint64(1)
 	store.CreateShard(database, retentionPolicy, shardID)

-	executor := NewQueryExecutor(store)
+	executor := tsdb.NewQueryExecutor(store)
 	executor.MetaStore = &testMetastore{}
 	executor.ShardMapper = &testShardMapper{store: store}

 	return store, executor
 }

-func executeAndGetJSON(query string, executor *QueryExecutor) string {
+func executeAndGetJSON(query string, executor *tsdb.QueryExecutor) string {
 	ch, err := executor.ExecuteQuery(mustParseQuery(query), "foo", 20)
 	if err != nil {
 		panic(err.Error())
@ -378,7 +370,12 @@ func executeAndGetJSON(query string, executor *QueryExecutor) string {
 	for r := range ch {
 		results = append(results, r)
 	}
-	return string(mustMarshalJSON(results))
+
+	b, err := json.Marshal(results)
+	if err != nil {
+		panic(err)
+	}
+	return string(b)
 }

 type testMetastore struct {
@ -467,10 +464,10 @@ func (t *testMetastore) NodeID() uint64 {
 }

 type testShardMapper struct {
-	store *Store
+	store *tsdb.Store
 }

-func (t *testShardMapper) CreateMapper(shard meta.ShardInfo, stmt string, chunkSize int) (Mapper, error) {
+func (t *testShardMapper) CreateMapper(shard meta.ShardInfo, stmt string, chunkSize int) (tsdb.Mapper, error) {
 	m, err := t.store.CreateMapper(shard.ID, stmt, chunkSize)
 	return m, err
 }
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/shard.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/shard.go
@ -1,19 +1,14 @@
 package tsdb

 import (
-	"bytes"
 	"encoding/binary"
 	"encoding/json"
 	"errors"
 	"fmt"
-	"hash/fnv"
 	"io"
-	"log"
 	"math"
 	"os"
-	"sort"
 	"sync"
-	"time"

 	"github.com/influxdb/influxdb/influxql"
 	"github.com/influxdb/influxdb/tsdb/internal"
@ -35,15 +30,8 @@ var (
 	// ErrFieldUnmappedID is returned when the system is presented, during decode, with a field ID
 	// there is no mapping for.
 	ErrFieldUnmappedID = errors.New("field ID not mapped")
-
-	// ErrWALPartitionNotFound is returns when flushing a WAL partition that
-	// does not exist.
-	ErrWALPartitionNotFound = errors.New("wal partition not found")
 )

-// topLevelBucketN is the number of non-series buckets in the bolt db.
-const topLevelBucketN = 3
-
 // Shard represents a self-contained time series database. An inverted index of
 // the measurement and tag data is kept along with the raw time series data.
 // Data can be split across many shards. The query engine in TSDB is responsible
@ -52,53 +40,27 @@ type Shard struct {
 	db    *bolt.DB // underlying data store
 	index *DatabaseIndex
 	path  string
-	cache map[uint8]map[string][][]byte // values by <wal partition,series>

-	walSize    int           // approximate size of the WAL, in bytes
-	flush      chan struct{} // signals background flush
-	flushTimer *time.Timer   // signals time-based flush
+	engine  Engine
+	options EngineOptions

 	mu                sync.RWMutex
-	measurementFields map[string]*measurementFields // measurement name to their fields
-
-	// These coordinate closing and waiting for running goroutines.
-	wg      sync.WaitGroup
-	closing chan struct{}
-
-	// Used for out-of-band error messages.
-	logger *log.Logger
-
-	// The maximum size and time thresholds for flushing the WAL.
-	MaxWALSize             int
-	WALFlushInterval       time.Duration
-	WALPartitionFlushDelay time.Duration
+	measurementFields map[string]*MeasurementFields // measurement name to their fields

 	// The writer used by the logger.
 	LogOutput io.Writer
 }

 // NewShard returns a new initialized Shard
-func NewShard(index *DatabaseIndex, path string) *Shard {
-	s := &Shard{
+func NewShard(index *DatabaseIndex, path string, options EngineOptions) *Shard {
+	return &Shard{
 		index:             index,
 		path:              path,
-		flush:             make(chan struct{}, 1),
-		measurementFields: make(map[string]*measurementFields),
-
-		MaxWALSize:             DefaultMaxWALSize,
-		WALFlushInterval:       DefaultWALFlushInterval,
-		WALPartitionFlushDelay: DefaultWALPartitionFlushDelay,
+		options:           options,
+		measurementFields: make(map[string]*MeasurementFields),

 		LogOutput: os.Stderr,
 	}
-
-	// Initialize all partitions of the cache.
-	s.cache = make(map[uint8]map[string][][]byte)
-	for i := uint8(0); i < WALPartitionN; i++ {
-		s.cache[i] = make(map[string][][]byte)
-	}
-
-	return s
 }

 // Path returns the path set on the shard when it was created.
@ -110,87 +72,57 @@ func (s *Shard) Open() error {
 		s.mu.Lock()
 		defer s.mu.Unlock()

+		s.index.mu.Lock()
+		defer s.index.mu.Unlock()
+
 		// Return if the shard is already open
-		if s.db != nil {
+		if s.engine != nil {
 			return nil
 		}

-		// Open store on shard.
-		store, err := bolt.Open(s.path, 0666, &bolt.Options{Timeout: 1 * time.Second})
+		// Initialize underlying engine.
+		e, err := NewEngine(s.path, s.options)
 		if err != nil {
-			return err
+			return fmt.Errorf("new engine: %s", err)
 		}
-		s.db = store
+		s.engine = e

-		// Initialize store.
-		if err := s.db.Update(func(tx *bolt.Tx) error {
-			_, _ = tx.CreateBucketIfNotExists([]byte("series"))
-			_, _ = tx.CreateBucketIfNotExists([]byte("fields"))
-			_, _ = tx.CreateBucketIfNotExists([]byte("wal"))
+		// Set log output on the engine.
+		s.engine.SetLogOutput(s.LogOutput)

-			return nil
-		}); err != nil {
-			return fmt.Errorf("init: %s", err)
+		// Open engine.
+		if err := s.engine.Open(); err != nil {
+			return fmt.Errorf("open engine: %s", err)
 		}

-		if err := s.loadMetadataIndex(); err != nil {
+		// Load metadata index.
+		if err := s.engine.LoadMetadataIndex(s.index, s.measurementFields); err != nil {
 			return fmt.Errorf("load metadata index: %s", err)
 		}

-		// Initialize logger.
-		s.logger = log.New(s.LogOutput, "[shard] ", log.LstdFlags)
-
-		// Start flush interval timer.
-		s.flushTimer = time.NewTimer(s.WALFlushInterval)
-
-		// Start background goroutines.
-		s.wg.Add(1)
-		s.closing = make(chan struct{})
-		go s.autoflusher(s.closing)
-
 		return nil
 	}(); err != nil {
 		s.close()
 		return err
 	}

-	// Flush on-disk WAL before we return to the caller.
-	if err := s.Flush(0); err != nil {
-		return fmt.Errorf("flush: %s", err)
-	}
-
 	return nil
 }

 // Close shuts down the shard's store.
 func (s *Shard) Close() error {
 	s.mu.Lock()
-	err := s.close()
-	s.mu.Unlock()
-
-	// Wait for open goroutines to finish.
-	s.wg.Wait()
-
-	return err
+	defer s.mu.Unlock()
+	return s.close()
 }

 func (s *Shard) close() error {
-	if s.db != nil {
-		s.db.Close()
-	}
-	if s.closing != nil {
-		close(s.closing)
-		s.closing = nil
+	if s.engine != nil {
+		return s.engine.Close()
 	}
 	return nil
 }

-// TODO: this is temporarily exported to make tx.go work. When the query engine gets refactored
-// into the tsdb package this should be removed. No one outside tsdb should know the underlying store.
-func (s *Shard) DB() *bolt.DB {
-	return s.db
-}
-
 // TODO: this is temporarily exported to make tx.go work. When the query engine gets refactored
 // into the tsdb package this should be removed. No one outside tsdb should know the underlying field encoding scheme.
 func (s *Shard) FieldCodec(measurementName string) *FieldCodec {
@ -198,21 +130,21 @@ func (s *Shard) FieldCodec(measurementName string) *FieldCodec {
 	defer s.mu.RUnlock()
 	m := s.measurementFields[measurementName]
 	if m == nil {
-		return nil
+		return NewFieldCodec(nil)
 	}
-	return m.codec
+	return m.Codec
 }

 // struct to hold information for a field to create on a measurement
-type fieldCreate struct {
-	measurement string
-	field       *field
+type FieldCreate struct {
+	Measurement string
+	Field       *Field
 }

 // struct to hold information for a series to create
-type seriesCreate struct {
-	measurement string
-	series      *Series
+type SeriesCreate struct {
+	Measurement string
+	Series      *Series
 }

 // WritePoints will write the raw data points and any new metadata to the index in the shard
@ -226,7 +158,7 @@ func (s *Shard) WritePoints(points []Point) error {
 	if len(seriesToCreate) > 0 {
 		s.index.mu.Lock()
 		for _, ss := range seriesToCreate {
-			s.index.createSeriesIndexIfNotExists(ss.measurement, ss.series)
+			s.index.CreateSeriesIndexIfNotExists(ss.Measurement, ss.Series)
 		}
 		s.index.mu.Unlock()
 	}
@ -239,262 +171,36 @@ func (s *Shard) WritePoints(points []Point) error {

 	// make sure all data is encoded before attempting to save to bolt
 	for _, p := range points {
-		// marshal the raw data if it hasn't been marshaled already
-		if p.Data() == nil {
-			// this was populated earlier, don't need to validate that it's there.
-			s.mu.RLock()
-			mf := s.measurementFields[p.Name()]
-			s.mu.RUnlock()
-
-			// If a measurement is dropped while writes for it are in progress, this could be nil
-			if mf == nil {
-				return ErrFieldNotFound
-			}
-
-			data, err := mf.codec.EncodeFields(p.Fields())
-			if err != nil {
-				return err
-			}
-			p.SetData(data)
+		// Ignore if raw data has already been marshaled.
+		if p.Data() != nil {
+			continue
 		}
+
+		// This was populated earlier, don't need to validate that it's there.
+		s.mu.RLock()
+		mf := s.measurementFields[p.Name()]
+		s.mu.RUnlock()
+
+		// If a measurement is dropped while writes for it are in progress, this could be nil
+		if mf == nil {
+			return ErrFieldNotFound
+		}
+
+		data, err := mf.Codec.EncodeFields(p.Fields())
+		if err != nil {
+			return err
+		}
+		p.SetData(data)
 	}

-	// save to the underlying bolt instance
-	if err := s.db.Update(func(tx *bolt.Tx) error {
-		// save any new metadata
-		if len(seriesToCreate) > 0 {
-			b := tx.Bucket([]byte("series"))
-			for _, sc := range seriesToCreate {
-				data, err := sc.series.MarshalBinary()
-				if err != nil {
-					return err
-				}
-				if err := b.Put([]byte(sc.series.Key), data); err != nil {
-					return err
-				}
-			}
-		}
-		if len(measurementFieldsToSave) > 0 {
-			b := tx.Bucket([]byte("fields"))
-			for name, m := range measurementFieldsToSave {
-				data, err := m.MarshalBinary()
-				if err != nil {
-					return err
-				}
-				if err := b.Put([]byte(name), data); err != nil {
-					return err
-				}
-			}
-		}
-
-		// Write points to WAL bucket.
-		wal := tx.Bucket([]byte("wal"))
-		for _, p := range points {
-			// Retrieve partition bucket.
-			key := p.Key()
-			b, err := wal.CreateBucketIfNotExists([]byte{WALPartition(key)})
-			if err != nil {
-				return fmt.Errorf("create WAL partition bucket: %s", err)
-			}
-
-			// Generate an autoincrementing index for the WAL partition.
-			id, _ := b.NextSequence()
-
-			// Append points sequentially to the WAL bucket.
-			v := marshalWALEntry(key, p.UnixNano(), p.Data())
-			if err := b.Put(u64tob(id), v); err != nil {
-				return fmt.Errorf("put wal: %s", err)
-			}
-		}
-
-		return nil
-	}); err != nil {
-		return err
-	}
-
-	// If successful then save points to in-memory cache.
-	if err := func() error {
-		s.mu.Lock()
-		defer s.mu.Unlock()
-
-		// tracks which in-memory caches need to be resorted
-		resorts := map[uint8]map[string]struct{}{}
-
-		for _, p := range points {
-			// Generate in-memory cache entry of <timestamp,data>.
-			key, data := p.Key(), p.Data()
-			v := make([]byte, 8+len(data))
-			binary.BigEndian.PutUint64(v[0:8], uint64(p.UnixNano()))
-			copy(v[8:], data)
-
-			// Determine if we are appending.
-			partitionID := WALPartition(key)
-			a := s.cache[partitionID][string(key)]
-			appending := (len(a) == 0 || bytes.Compare(a[len(a)-1], v) == -1)
-
-			// Append to cache list.
-			a = append(a, v)
-
-			// If not appending, keep track of cache lists that need to be resorted.
-			if !appending {
-				series := resorts[partitionID]
-				if series == nil {
-					series = map[string]struct{}{}
-					resorts[partitionID] = series
-				}
-				series[string(key)] = struct{}{}
-			}
-
-			s.cache[partitionID][string(key)] = a
-
-			// Calculate estimated WAL size.
-			s.walSize += len(key) + len(v)
-		}
-
-		// Sort by timestamp if not appending.
-		for partitionID, cache := range resorts {
-			for key, _ := range cache {
-				sort.Sort(byteSlices(s.cache[partitionID][key]))
-			}
-		}
-
-		// Check for flush threshold.
-		s.triggerAutoFlush()
-
-		return nil
-	}(); err != nil {
-		return err
+	// Write to the engine.
+	if err := s.engine.WritePoints(points, measurementFieldsToSave, seriesToCreate); err != nil {
+		return fmt.Errorf("engine: %s", err)
 	}

 	return nil
 }

-// Flush writes all points from the write ahead log to the index.
-func (s *Shard) Flush(partitionFlushDelay time.Duration) error {
-	// Retrieve a list of WAL buckets.
-	var partitionIDs []uint8
-	if err := s.db.View(func(tx *bolt.Tx) error {
-		return tx.Bucket([]byte("wal")).ForEach(func(key, _ []byte) error {
-			partitionIDs = append(partitionIDs, uint8(key[0]))
-			return nil
-		})
-	}); err != nil {
-		return err
-	}
-
-	// Continue flushing until there are no more partition buckets.
-	for _, partitionID := range partitionIDs {
-		if err := s.FlushPartition(partitionID); err != nil {
-			return fmt.Errorf("flush partition: id=%d, err=%s", partitionID, err)
-		}
-
-		// Wait momentarily so other threads can process.
-		time.Sleep(partitionFlushDelay)
-	}
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	// Reset WAL size.
-	s.walSize = 0
-
-	// Reset the timer.
-	s.flushTimer.Reset(s.WALFlushInterval)
-
-	return nil
-}
-
-// FlushPartition flushes a single WAL partition.
-func (s *Shard) FlushPartition(partitionID uint8) error {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	startTime := time.Now()
-
-	var pointN int
-	if err := s.db.Update(func(tx *bolt.Tx) error {
-		// Retrieve partition bucket. Exit if it doesn't exist.
-		pb := tx.Bucket([]byte("wal")).Bucket([]byte{byte(partitionID)})
-		if pb == nil {
-			return ErrWALPartitionNotFound
-		}
-
-		// Iterate over keys in the WAL partition bucket.
-		c := pb.Cursor()
-		for k, v := c.First(); k != nil; k, v = c.Next() {
-			key, timestamp, data := unmarshalWALEntry(v)
-
-			// Create bucket for entry.
-			b, err := tx.CreateBucketIfNotExists(key)
-			if err != nil {
-				return fmt.Errorf("create bucket: %s", err)
-			}
-
-			// Write point to bucket.
-			if err := b.Put(u64tob(uint64(timestamp)), data); err != nil {
-				return fmt.Errorf("put: %s", err)
-			}
-
-			// Remove entry in the WAL.
-			if err := c.Delete(); err != nil {
-				return fmt.Errorf("delete: %s", err)
-			}
-
-			pointN++
-		}
-
-		return nil
-	}); err != nil {
-		return err
-	}
-
-	// Reset cache.
-	s.cache[partitionID] = make(map[string][][]byte)
-
-	if pointN > 0 {
-		s.logger.Printf("flush %d points in %.3fs", pointN, time.Since(startTime).Seconds())
-	}
-
-	return nil
-}
-
-// autoflusher waits for notification of a flush and kicks it off in the background.
-// This method runs in a separate goroutine.
-func (s *Shard) autoflusher(closing chan struct{}) {
-	defer s.wg.Done()
-
-	for {
-		// Wait for close or flush signal.
-		select {
-		case <-closing:
-			return
-		case <-s.flushTimer.C:
-			if err := s.Flush(s.WALPartitionFlushDelay); err != nil {
-				s.logger.Printf("flush error: %s", err)
-			}
-		case <-s.flush:
-			if err := s.Flush(s.WALPartitionFlushDelay); err != nil {
-				s.logger.Printf("flush error: %s", err)
-			}
-		}
-	}
-}
-
-// triggerAutoFlush signals that a flush should occur if the size is above the threshold.
-// This function must be called within the context of a lock.
-func (s *Shard) triggerAutoFlush() {
-	// Ignore if we haven't reached the threshold.
-	if s.walSize < s.MaxWALSize {
-		return
-	}
-
-	// Otherwise send a non-blocking signal.
-	select {
-	case s.flush <- struct{}{}:
-	default:
-	}
-}
-
 func (s *Shard) ValidateAggregateFieldsInStatement(measurementName string, stmt *influxql.SelectStatement) error {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
@ -547,62 +253,27 @@ func (s *Shard) ValidateAggregateFieldsInStatement(measurementName string, stmt
 	return nil
 }

-// deleteSeries deletes the buckets and the metadata for the given series keys
-func (s *Shard) deleteSeries(keys []string) error {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	if err := s.db.Update(func(tx *bolt.Tx) error {
-		b := tx.Bucket([]byte("series"))
-		for _, k := range keys {
-			if err := b.Delete([]byte(k)); err != nil {
-				return err
-			}
-			if err := tx.DeleteBucket([]byte(k)); err != nil && err != bolt.ErrBucketNotFound {
-				return err
-			}
-			delete(s.cache[WALPartition([]byte(k))], k)
-		}
-		return nil
-	}); err != nil {
-		return err
-	}
-
-	return nil
+// DeleteSeries deletes a list of series.
+func (s *Shard) DeleteSeries(keys []string) error {
+	return s.engine.DeleteSeries(keys)
 }

-// deleteMeasurement deletes the measurement field encoding information and all underlying series from the shard
-func (s *Shard) deleteMeasurement(name string, seriesKeys []string) error {
+// DeleteMeasurement deletes a measurement and all underlying series.
+func (s *Shard) DeleteMeasurement(name string, seriesKeys []string) error {
 	s.mu.Lock()
 	defer s.mu.Unlock()

-	if err := s.db.Update(func(tx *bolt.Tx) error {
-		bm := tx.Bucket([]byte("fields"))
-		if err := bm.Delete([]byte(name)); err != nil {
-			return err
-		}
-		b := tx.Bucket([]byte("series"))
-		for _, k := range seriesKeys {
-			if err := b.Delete([]byte(k)); err != nil {
-				return err
-			}
-			if err := tx.DeleteBucket([]byte(k)); err != nil && err != bolt.ErrBucketNotFound {
-				return err
-			}
-			delete(s.cache[WALPartition([]byte(k))], k)
-		}
-
-		return nil
-	}); err != nil {
+	if err := s.engine.DeleteMeasurement(name, seriesKeys); err != nil {
 		return err
 	}

 	// Remove entry from shard index.
 	delete(s.measurementFields, name)
+
 	return nil
 }

-func (s *Shard) createFieldsAndMeasurements(fieldsToCreate []*fieldCreate) (map[string]*measurementFields, error) {
+func (s *Shard) createFieldsAndMeasurements(fieldsToCreate []*FieldCreate) (map[string]*MeasurementFields, error) {
 	if len(fieldsToCreate) == 0 {
 		return nil, nil
 	}
@ -613,37 +284,37 @@ func (s *Shard) createFieldsAndMeasurements(fieldsToCreate []*fieldCreate) (map[
 	defer s.mu.Unlock()

 	// add fields
-	measurementsToSave := make(map[string]*measurementFields)
+	measurementsToSave := make(map[string]*MeasurementFields)
 	for _, f := range fieldsToCreate {

-		m := s.measurementFields[f.measurement]
+		m := s.measurementFields[f.Measurement]
 		if m == nil {
-			m = measurementsToSave[f.measurement]
+			m = measurementsToSave[f.Measurement]
 			if m == nil {
-				m = &measurementFields{Fields: make(map[string]*field)}
+				m = &MeasurementFields{Fields: make(map[string]*Field)}
 			}
-			s.measurementFields[f.measurement] = m
+			s.measurementFields[f.Measurement] = m
 		}

-		measurementsToSave[f.measurement] = m
+		measurementsToSave[f.Measurement] = m

 		// add the field to the in memory index
-		if err := m.createFieldIfNotExists(f.field.Name, f.field.Type); err != nil {
+		if err := m.CreateFieldIfNotExists(f.Field.Name, f.Field.Type); err != nil {
 			return nil, err
 		}

 		// ensure the measurement is in the index and the field is there
-		measurement := s.index.createMeasurementIndexIfNotExists(f.measurement)
-		measurement.fieldNames[f.field.Name] = struct{}{}
+		measurement := s.index.CreateMeasurementIndexIfNotExists(f.Measurement)
+		measurement.fieldNames[f.Field.Name] = struct{}{}
 	}

 	return measurementsToSave, nil
 }

 // validateSeriesAndFields checks which series and fields are new and whose metadata should be saved and indexed
-func (s *Shard) validateSeriesAndFields(points []Point) ([]*seriesCreate, []*fieldCreate, error) {
-	var seriesToCreate []*seriesCreate
-	var fieldsToCreate []*fieldCreate
+func (s *Shard) validateSeriesAndFields(points []Point) ([]*SeriesCreate, []*FieldCreate, error) {
+	var seriesToCreate []*SeriesCreate
+	var fieldsToCreate []*FieldCreate

 	// get the mutex for the in memory index, which is shared across shards
 	s.index.mu.RLock()
@ -657,14 +328,14 @@ func (s *Shard) validateSeriesAndFields(points []Point) ([]*seriesCreate, []*fie
 		// see if the series should be added to the index
 		if ss := s.index.series[string(p.Key())]; ss == nil {
 			series := &Series{Key: string(p.Key()), Tags: p.Tags()}
-			seriesToCreate = append(seriesToCreate, &seriesCreate{p.Name(), series})
+			seriesToCreate = append(seriesToCreate, &SeriesCreate{p.Name(), series})
 		}

 		// see if the field definitions need to be saved to the shard
 		mf := s.measurementFields[p.Name()]
 		if mf == nil {
 			for name, value := range p.Fields() {
-				fieldsToCreate = append(fieldsToCreate, &fieldCreate{p.Name(), &field{Name: name, Type: influxql.InspectDataType(value)}})
+				fieldsToCreate = append(fieldsToCreate, &FieldCreate{p.Name(), &Field{Name: name, Type: influxql.InspectDataType(value)}})
 			}
 			continue // skip validation since all fields are new
 		}
@ -680,72 +351,23 @@ func (s *Shard) validateSeriesAndFields(points []Point) ([]*seriesCreate, []*fie
 				continue // Field is present, and it's of the same type. Nothing more to do.
 			}

-			fieldsToCreate = append(fieldsToCreate, &fieldCreate{p.Name(), &field{Name: name, Type: influxql.InspectDataType(value)}})
+			fieldsToCreate = append(fieldsToCreate, &FieldCreate{p.Name(), &Field{Name: name, Type: influxql.InspectDataType(value)}})
 		}
 	}

 	return seriesToCreate, fieldsToCreate, nil
 }

-// loadsMetadataIndex loads the shard metadata into memory. This should only be called by Open
-func (s *Shard) loadMetadataIndex() error {
-	return s.db.View(func(tx *bolt.Tx) error {
-		s.index.mu.Lock()
-		defer s.index.mu.Unlock()
-
-		// load measurement metadata
-		meta := tx.Bucket([]byte("fields"))
-		c := meta.Cursor()
-		for k, v := c.First(); k != nil; k, v = c.Next() {
-			m := s.index.createMeasurementIndexIfNotExists(string(k))
-			mf := &measurementFields{}
-			if err := mf.UnmarshalBinary(v); err != nil {
-				return err
-			}
-			for name, _ := range mf.Fields {
-				m.fieldNames[name] = struct{}{}
-			}
-			mf.codec = newFieldCodec(mf.Fields)
-			s.measurementFields[m.Name] = mf
-		}
-
-		// load series metadata
-		meta = tx.Bucket([]byte("series"))
-		c = meta.Cursor()
-		for k, v := c.First(); k != nil; k, v = c.Next() {
-			series := &Series{}
-			if err := series.UnmarshalBinary(v); err != nil {
-				return err
-			}
-			s.index.createSeriesIndexIfNotExists(measurementFromSeriesKey(string(k)), series)
-		}
-		return nil
-	})
-}
-
 // SeriesCount returns the number of series buckets on the shard.
-// This does not include a count from the WAL.
-func (s *Shard) SeriesCount() (n int, err error) {
-	err = s.db.View(func(tx *bolt.Tx) error {
-		return tx.ForEach(func(_ []byte, _ *bolt.Bucket) error {
-			n++
-			return nil
-		})
-	})
+func (s *Shard) SeriesCount() (int, error) { return s.engine.SeriesCount() }

-	// Remove top-level buckets.
-	n -= topLevelBucketN
-
-	return
-}
-
-type measurementFields struct {
-	Fields map[string]*field `json:"fields"`
-	codec  *FieldCodec
+type MeasurementFields struct {
+	Fields map[string]*Field `json:"fields"`
+	Codec  *FieldCodec
 }

 // MarshalBinary encodes the object to a binary format.
-func (m *measurementFields) MarshalBinary() ([]byte, error) {
+func (m *MeasurementFields) MarshalBinary() ([]byte, error) {
 	var pb internal.MeasurementFields
 	for _, f := range m.Fields {
 		id := int32(f.ID)
@ -757,22 +379,22 @@ func (m *measurementFields) MarshalBinary() ([]byte, error) {
 }

 // UnmarshalBinary decodes the object from a binary format.
-func (m *measurementFields) UnmarshalBinary(buf []byte) error {
+func (m *MeasurementFields) UnmarshalBinary(buf []byte) error {
 	var pb internal.MeasurementFields
 	if err := proto.Unmarshal(buf, &pb); err != nil {
 		return err
 	}
-	m.Fields = make(map[string]*field)
+	m.Fields = make(map[string]*Field)
 	for _, f := range pb.Fields {
-		m.Fields[f.GetName()] = &field{ID: uint8(f.GetID()), Name: f.GetName(), Type: influxql.DataType(f.GetType())}
+		m.Fields[f.GetName()] = &Field{ID: uint8(f.GetID()), Name: f.GetName(), Type: influxql.DataType(f.GetType())}
 	}
 	return nil
 }

-// createFieldIfNotExists creates a new field with an autoincrementing ID.
+// CreateFieldIfNotExists creates a new field with an autoincrementing ID.
 // Returns an error if 255 fields have already been created on the measurement or
 // the fields already exists with a different type.
-func (m *measurementFields) createFieldIfNotExists(name string, typ influxql.DataType) error {
+func (m *MeasurementFields) CreateFieldIfNotExists(name string, typ influxql.DataType) error {
 	// Ignore if the field already exists.
 	if f := m.Fields[name]; f != nil {
 		if f.Type != typ {
@ -787,19 +409,19 @@ func (m *measurementFields) createFieldIfNotExists(name string, typ influxql.Dat
 	}

 	// Create and append a new field.
-	f := &field{
+	f := &Field{
 		ID:   uint8(len(m.Fields) + 1),
 		Name: name,
 		Type: typ,
 	}
 	m.Fields[name] = f
-	m.codec = newFieldCodec(m.Fields)
+	m.Codec = NewFieldCodec(m.Fields)

 	return nil
 }

 // Field represents a series field.
-type field struct {
+type Field struct {
 	ID   uint8             `json:"id,omitempty"`
 	Name string            `json:"name,omitempty"`
 	Type influxql.DataType `json:"type,omitempty"`
@ -813,15 +435,15 @@ type field struct {
 // TODO: this shouldn't be exported. nothing outside the shard should know about field encodings.
 //       However, this is here until tx.go and the engine get refactored into tsdb.
 type FieldCodec struct {
-	fieldsByID   map[uint8]*field
-	fieldsByName map[string]*field
+	fieldsByID   map[uint8]*Field
+	fieldsByName map[string]*Field
 }

 // NewFieldCodec returns a FieldCodec for the given Measurement. Must be called with
 // a RLock that protects the Measurement.
-func newFieldCodec(fields map[string]*field) *FieldCodec {
-	fieldsByID := make(map[uint8]*field, len(fields))
-	fieldsByName := make(map[string]*field, len(fields))
+func NewFieldCodec(fields map[string]*Field) *FieldCodec {
+	fieldsByID := make(map[uint8]*Field, len(fields))
+	fieldsByName := make(map[string]*Field, len(fields))
 	for _, f := range fields {
 		fieldsByID[f.ID] = f
 		fieldsByName[f.Name] = f
@ -1045,15 +667,15 @@ func (f *FieldCodec) DecodeByID(targetID uint8, b []byte) (interface{}, error) {
 // DecodeByName scans a byte slice for a field with the given name, converts it to its
 // expected type, and return that value.
 func (f *FieldCodec) DecodeByName(name string, b []byte) (interface{}, error) {
-	if fi := f.fieldByName(name); fi == nil {
+	fi := f.fieldByName(name)
+	if fi == nil {
 		return 0, ErrFieldNotFound
-	} else {
-		return f.DecodeByID(fi.ID, b)
 	}
+	return f.DecodeByID(fi.ID, b)
 }

 // FieldByName returns the field by its name. It will return a nil if not found
-func (f *FieldCodec) fieldByName(name string) *field {
+func (f *FieldCodec) fieldByName(name string) *Field {
 	return f.fieldsByName[name]
 }

@ -1083,136 +705,3 @@ func u64tob(v uint64) []byte {
 	binary.BigEndian.PutUint64(b, v)
 	return b
 }
-
-// marshalWALEntry encodes point data into a single byte slice.
-//
-// The format of the byte slice is:
-//
-//     uint64 timestamp
-//     uint32 key length
-//     []byte key
-//     []byte data
-//
-func marshalWALEntry(key []byte, timestamp int64, data []byte) []byte {
-	v := make([]byte, 8+4, 8+4+len(key)+len(data))
-	binary.BigEndian.PutUint64(v[0:8], uint64(timestamp))
-	binary.BigEndian.PutUint32(v[8:12], uint32(len(key)))
-	v = append(v, key...)
-	v = append(v, data...)
-	return v
-}
-
-// unmarshalWALEntry decodes a WAL entry into it's separate parts.
-// Returned byte slices point to the original slice.
-func unmarshalWALEntry(v []byte) (key []byte, timestamp int64, data []byte) {
-	keyLen := binary.BigEndian.Uint32(v[8:12])
-	key = v[12 : 12+keyLen]
-	timestamp = int64(binary.BigEndian.Uint64(v[0:8]))
-	data = v[12+keyLen:]
-	return
-}
-
-// marshalCacheEntry encodes the timestamp and data to a single byte slice.
-//
-// The format of the byte slice is:
-//
-//     uint64 timestamp
-//     []byte data
-//
-func marshalCacheEntry(timestamp int64, data []byte) []byte {
-	buf := make([]byte, 8, 8+len(data))
-	binary.BigEndian.PutUint64(buf[0:8], uint64(timestamp))
-	return append(buf, data...)
-}
-
-// unmarshalCacheEntry returns the timestamp and data from an encoded byte slice.
-func unmarshalCacheEntry(buf []byte) (timestamp int64, data []byte) {
-	timestamp = int64(binary.BigEndian.Uint64(buf[0:8]))
-	data = buf[8:]
-	return
-}
-
-// byteSlices represents a sortable slice of byte slices.
-type byteSlices [][]byte
-
-func (a byteSlices) Len() int           { return len(a) }
-func (a byteSlices) Less(i, j int) bool { return bytes.Compare(a[i], a[j]) == -1 }
-func (a byteSlices) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
-
-// shardCursor provides ordered iteration across a Bolt bucket and shard cache.
-type shardCursor struct {
-	// Bolt cursor and readahead buffer.
-	cursor *bolt.Cursor
-	buf    struct {
-		key, value []byte
-	}
-
-	// Cache and current cache index.
-	cache [][]byte
-	index int
-}
-
-// Seek moves the cursor to a position and returns the closest key/value pair.
-func (sc *shardCursor) Seek(seek []byte) (key, value []byte) {
-	// Seek bolt cursor.
-	if sc.cursor != nil {
-		sc.buf.key, sc.buf.value = sc.cursor.Seek(seek)
-	}
-
-	// Seek cache index.
-	sc.index = sort.Search(len(sc.cache), func(i int) bool {
-		return bytes.Compare(sc.cache[i][0:8], seek) != -1
-	})
-
-	return sc.read()
-}
-
-// Next returns the next key/value pair from the cursor.
-func (sc *shardCursor) Next() (key, value []byte) {
-	// Read next bolt key/value if not bufferred.
-	if sc.buf.key == nil && sc.cursor != nil {
-		sc.buf.key, sc.buf.value = sc.cursor.Next()
-	}
-
-	return sc.read()
-}
-
-// read returns the next key/value in the cursor buffer or cache.
-func (sc *shardCursor) read() (key, value []byte) {
-	// If neither a buffer or cache exists then return nil.
-	if sc.buf.key == nil && sc.index >= len(sc.cache) {
-		return nil, nil
-	}
-
-	// Use the buffer if it exists and there's no cache or if it is lower than the cache.
-	if sc.buf.key != nil && (sc.index >= len(sc.cache) || bytes.Compare(sc.buf.key, sc.cache[sc.index][0:8]) == -1) {
-		key, value = sc.buf.key, sc.buf.value
-		sc.buf.key, sc.buf.value = nil, nil
-		return
-	}
-
-	// Otherwise read from the cache.
-	// Continue skipping ahead through duplicate keys in the cache list.
-	for {
-		// Read the current cache key/value pair.
-		key, value = sc.cache[sc.index][0:8], sc.cache[sc.index][8:]
-		sc.index++
-
-		// Exit loop if we're at the end of the cache or the next key is different.
-		if sc.index >= len(sc.cache) || !bytes.Equal(key, sc.cache[sc.index][0:8]) {
-			break
-		}
-	}
-
-	return
-}
-
-// WALPartitionN is the number of partitions in the write ahead log.
-const WALPartitionN = 8
-
-// WALPartition returns the partition number that key belongs to.
-func WALPartition(key []byte) uint8 {
-	h := fnv.New64a()
-	h.Write(key)
-	return uint8(h.Sum64() % WALPartitionN)
-}
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/shard_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/shard_test.go
@ -1,4 +1,4 @@
-package tsdb
+package tsdb_test

 import (
 	"fmt"
@ -9,6 +9,8 @@ import (
 	"reflect"
 	"testing"
 	"time"
+
+	"github.com/influxdb/influxdb/tsdb"
 )

 func TestShardWriteAndIndex(t *testing.T) {
@ -16,42 +18,43 @@ func TestShardWriteAndIndex(t *testing.T) {
 	defer os.RemoveAll(tmpDir)
 	tmpShard := path.Join(tmpDir, "shard")

-	index := NewDatabaseIndex()
-	sh := NewShard(index, tmpShard)
+	index := tsdb.NewDatabaseIndex()
+	sh := tsdb.NewShard(index, tmpShard, tsdb.NewEngineOptions())
 	if err := sh.Open(); err != nil {
 		t.Fatalf("error openeing shard: %s", err.Error())
 	}

-	pt := NewPoint(
+	pt := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "server"},
 		map[string]interface{}{"value": 1.0},
 		time.Unix(1, 2),
 	)

-	err := sh.WritePoints([]Point{pt})
+	err := sh.WritePoints([]tsdb.Point{pt})
 	if err != nil {
 		t.Fatalf(err.Error())
 	}

 	pt.SetTime(time.Unix(2, 3))
-	err = sh.WritePoints([]Point{pt})
+	err = sh.WritePoints([]tsdb.Point{pt})
 	if err != nil {
 		t.Fatalf(err.Error())
 	}

 	validateIndex := func() {
-		if !reflect.DeepEqual(index.names, []string{"cpu"}) {
+		if !reflect.DeepEqual(index.Names(), []string{"cpu"}) {
 			t.Fatalf("measurement names in shard didn't match")
 		}
-		if len(index.series) != 1 {
+		if index.SeriesN() != 1 {
 			t.Fatalf("series wasn't in index")
 		}
-		seriesTags := index.series[string(pt.Key())].Tags
+
+		seriesTags := index.Series(string(pt.Key())).Tags
 		if len(seriesTags) != len(pt.Tags()) || pt.Tags()["host"] != seriesTags["host"] {
-			t.Fatalf("tags weren't properly saved to series index: %v, %v", pt.Tags(), index.series[string(pt.Key())].Tags)
+			t.Fatalf("tags weren't properly saved to series index: %v, %v", pt.Tags(), seriesTags)
 		}
-		if !reflect.DeepEqual(index.measurements["cpu"].TagKeys(), []string{"host"}) {
+		if !reflect.DeepEqual(index.Measurement("cpu").TagKeys(), []string{"host"}) {
 			t.Fatalf("tag key wasn't saved to measurement index")
 		}
 	}
@ -61,8 +64,8 @@ func TestShardWriteAndIndex(t *testing.T) {
 	// ensure the index gets loaded after closing and opening the shard
 	sh.Close()

-	index = NewDatabaseIndex()
-	sh = NewShard(index, tmpShard)
+	index = tsdb.NewDatabaseIndex()
+	sh = tsdb.NewShard(index, tmpShard, tsdb.NewEngineOptions())
 	if err := sh.Open(); err != nil {
 		t.Fatalf("error openeing shard: %s", err.Error())
 	}
@ -71,7 +74,7 @@ func TestShardWriteAndIndex(t *testing.T) {

 	// and ensure that we can still write data
 	pt.SetTime(time.Unix(2, 6))
-	err = sh.WritePoints([]Point{pt})
+	err = sh.WritePoints([]tsdb.Point{pt})
 	if err != nil {
 		t.Fatalf(err.Error())
 	}
@ -82,52 +85,52 @@ func TestShardWriteAddNewField(t *testing.T) {
 	defer os.RemoveAll(tmpDir)
 	tmpShard := path.Join(tmpDir, "shard")

-	index := NewDatabaseIndex()
-	sh := NewShard(index, tmpShard)
+	index := tsdb.NewDatabaseIndex()
+	sh := tsdb.NewShard(index, tmpShard, tsdb.NewEngineOptions())
 	if err := sh.Open(); err != nil {
 		t.Fatalf("error openeing shard: %s", err.Error())
 	}
 	defer sh.Close()

-	pt := NewPoint(
+	pt := tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "server"},
 		map[string]interface{}{"value": 1.0},
 		time.Unix(1, 2),
 	)

-	err := sh.WritePoints([]Point{pt})
+	err := sh.WritePoints([]tsdb.Point{pt})
 	if err != nil {
 		t.Fatalf(err.Error())
 	}

-	pt = NewPoint(
+	pt = tsdb.NewPoint(
 		"cpu",
 		map[string]string{"host": "server"},
 		map[string]interface{}{"value": 1.0, "value2": 2.0},
 		time.Unix(1, 2),
 	)

-	err = sh.WritePoints([]Point{pt})
+	err = sh.WritePoints([]tsdb.Point{pt})
 	if err != nil {
 		t.Fatalf(err.Error())
 	}

-	if !reflect.DeepEqual(index.names, []string{"cpu"}) {
+	if !reflect.DeepEqual(index.Names(), []string{"cpu"}) {
 		t.Fatalf("measurement names in shard didn't match")
 	}
-	if len(index.series) != 1 {
+	if index.SeriesN() != 1 {
 		t.Fatalf("series wasn't in index")
 	}
-	seriesTags := index.series[string(pt.Key())].Tags
+	seriesTags := index.Series(string(pt.Key())).Tags
 	if len(seriesTags) != len(pt.Tags()) || pt.Tags()["host"] != seriesTags["host"] {
-		t.Fatalf("tags weren't properly saved to series index: %v, %v", pt.Tags(), index.series[string(pt.Key())].Tags)
+		t.Fatalf("tags weren't properly saved to series index: %v, %v", pt.Tags(), seriesTags)
 	}
-	if !reflect.DeepEqual(index.measurements["cpu"].TagKeys(), []string{"host"}) {
+	if !reflect.DeepEqual(index.Measurement("cpu").TagKeys(), []string{"host"}) {
 		t.Fatalf("tag key wasn't saved to measurement index")
 	}

-	if len(index.measurements["cpu"].FieldNames()) != 2 {
+	if len(index.Measurement("cpu").FieldNames()) != 2 {
 		t.Fatalf("field names wasn't saved to measurement index")
 	}

@ -139,10 +142,11 @@ func TestShard_Autoflush(t *testing.T) {
 	defer os.RemoveAll(path)

 	// Open shard with a really low size threshold, high flush interval.
-	sh := NewShard(NewDatabaseIndex(), filepath.Join(path, "shard"))
-	sh.MaxWALSize = 1024 // 1KB
-	sh.WALFlushInterval = 1 * time.Hour
-	sh.WALPartitionFlushDelay = 1 * time.Millisecond
+	sh := tsdb.NewShard(tsdb.NewDatabaseIndex(), filepath.Join(path, "shard"), tsdb.EngineOptions{
+		MaxWALSize:             1024, // 1KB
+		WALFlushInterval:       1 * time.Hour,
+		WALPartitionFlushDelay: 1 * time.Millisecond,
+	})
 	if err := sh.Open(); err != nil {
 		t.Fatal(err)
 	}
@ -150,7 +154,7 @@ func TestShard_Autoflush(t *testing.T) {

 	// Write a bunch of points.
 	for i := 0; i < 100; i++ {
-		if err := sh.WritePoints([]Point{NewPoint(
+		if err := sh.WritePoints([]tsdb.Point{tsdb.NewPoint(
 			fmt.Sprintf("cpu%d", i),
 			map[string]string{"host": "server"},
 			map[string]interface{}{"value": 1.0},
@ -177,10 +181,11 @@ func TestShard_Autoflush_FlushInterval(t *testing.T) {
 	defer os.RemoveAll(path)

 	// Open shard with a high size threshold, small time threshold.
-	sh := NewShard(NewDatabaseIndex(), filepath.Join(path, "shard"))
-	sh.MaxWALSize = 10 * 1024 * 1024 // 10MB
-	sh.WALFlushInterval = 100 * time.Millisecond
-	sh.WALPartitionFlushDelay = 1 * time.Millisecond
+	sh := tsdb.NewShard(tsdb.NewDatabaseIndex(), filepath.Join(path, "shard"), tsdb.EngineOptions{
+		MaxWALSize:             10 * 1024 * 1024, // 10MB
+		WALFlushInterval:       100 * time.Millisecond,
+		WALPartitionFlushDelay: 1 * time.Millisecond,
+	})
 	if err := sh.Open(); err != nil {
 		t.Fatal(err)
 	}
@ -188,7 +193,7 @@ func TestShard_Autoflush_FlushInterval(t *testing.T) {

 	// Write some points.
 	for i := 0; i < 100; i++ {
-		if err := sh.WritePoints([]Point{NewPoint(
+		if err := sh.WritePoints([]tsdb.Point{tsdb.NewPoint(
 			fmt.Sprintf("cpu%d", i),
 			map[string]string{"host": "server"},
 			map[string]interface{}{"value": 1.0},
@ -240,12 +245,12 @@ func benchmarkWritePoints(b *testing.B, mCnt, tkCnt, tvCnt, pntCnt int) {
 	// Generate test series (measurements + unique tag sets).
 	series := genTestSeries(mCnt, tkCnt, tvCnt)
 	// Create index for the shard to use.
-	index := NewDatabaseIndex()
+	index := tsdb.NewDatabaseIndex()
 	// Generate point data to write to the shard.
-	points := []Point{}
+	points := []tsdb.Point{}
 	for _, s := range series {
 		for val := 0.0; val < float64(pntCnt); val++ {
-			p := NewPoint(s.Measurement, s.Series.Tags, map[string]interface{}{"value": val}, time.Now())
+			p := tsdb.NewPoint(s.Measurement, s.Series.Tags, map[string]interface{}{"value": val}, time.Now())
 			points = append(points, p)
 		}
 	}
@ -258,7 +263,7 @@ func benchmarkWritePoints(b *testing.B, mCnt, tkCnt, tvCnt, pntCnt int) {
 	for n := 0; n < b.N; n++ {
 		tmpDir, _ := ioutil.TempDir("", "shard_test")
 		tmpShard := path.Join(tmpDir, "shard")
-		shard := NewShard(index, tmpShard)
+		shard := tsdb.NewShard(index, tmpShard, tsdb.NewEngineOptions())
 		shard.Open()

 		b.StartTimer()
@ -280,12 +285,12 @@ func benchmarkWritePointsExistingSeries(b *testing.B, mCnt, tkCnt, tvCnt, pntCnt
 	// Generate test series (measurements + unique tag sets).
 	series := genTestSeries(mCnt, tkCnt, tvCnt)
 	// Create index for the shard to use.
-	index := NewDatabaseIndex()
+	index := tsdb.NewDatabaseIndex()
 	// Generate point data to write to the shard.
-	points := []Point{}
+	points := []tsdb.Point{}
 	for _, s := range series {
 		for val := 0.0; val < float64(pntCnt); val++ {
-			p := NewPoint(s.Measurement, s.Series.Tags, map[string]interface{}{"value": val}, time.Now())
+			p := tsdb.NewPoint(s.Measurement, s.Series.Tags, map[string]interface{}{"value": val}, time.Now())
 			points = append(points, p)
 		}
 	}
@ -293,7 +298,7 @@ func benchmarkWritePointsExistingSeries(b *testing.B, mCnt, tkCnt, tvCnt, pntCnt
 	tmpDir, _ := ioutil.TempDir("", "")
 	defer os.RemoveAll(tmpDir)
 	tmpShard := path.Join(tmpDir, "shard")
-	shard := NewShard(index, tmpShard)
+	shard := tsdb.NewShard(index, tmpShard, tsdb.NewEngineOptions())
 	shard.Open()
 	defer shard.Close()
 	chunkedWrite(shard, points)
@ -314,7 +319,7 @@ func benchmarkWritePointsExistingSeries(b *testing.B, mCnt, tkCnt, tvCnt, pntCnt
 	}
 }

-func chunkedWrite(shard *Shard, points []Point) {
+func chunkedWrite(shard *tsdb.Shard, points []tsdb.Point) {
 	nPts := len(points)
 	chunkSz := 10000
 	start := 0
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/store.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/store.go
@ -9,18 +9,15 @@ import (
 	"strconv"
 	"strings"
 	"sync"
-	"time"

 	"github.com/influxdb/influxdb/influxql"
 )

 func NewStore(path string) *Store {
 	return &Store{
-		path:                   path,
-		MaxWALSize:             DefaultMaxWALSize,
-		WALFlushInterval:       DefaultWALFlushInterval,
-		WALPartitionFlushDelay: DefaultWALPartitionFlushDelay,
-		Logger:                 log.New(os.Stderr, "[store] ", log.LstdFlags),
+		path:          path,
+		EngineOptions: NewEngineOptions(),
+		Logger:        log.New(os.Stderr, "[store] ", log.LstdFlags),
 	}
 }

@ -35,16 +32,34 @@ type Store struct {
 	databaseIndexes map[string]*DatabaseIndex
 	shards          map[uint64]*Shard

-	MaxWALSize             int
-	WALFlushInterval       time.Duration
-	WALPartitionFlushDelay time.Duration
-
-	Logger *log.Logger
+	EngineOptions EngineOptions
+	Logger        *log.Logger
 }

 // Path returns the store's root path.
 func (s *Store) Path() string { return s.path }

+// DatabaseIndexN returns the number of database indexes in the store.
+func (s *Store) DatabaseIndexN() int {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	return len(s.databaseIndexes)
+}
+
+// Shard returns a shard by id.
+func (s *Store) Shard(id uint64) *Shard {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	return s.shards[id]
+}
+
+// ShardN returns the number of shards in the store.
+func (s *Store) ShardN() int {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	return len(s.shards)
+}
+
 func (s *Store) CreateShard(database, retentionPolicy string, shardID uint64) error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@ -67,7 +82,7 @@ func (s *Store) CreateShard(database, retentionPolicy string, shardID uint64) er
 	}

 	shardPath := filepath.Join(s.path, database, retentionPolicy, strconv.FormatUint(shardID, 10))
-	shard := s.newShard(db, shardPath)
+	shard := NewShard(db, shardPath, s.EngineOptions)
 	if err := shard.Open(); err != nil {
 		return err
 	}
@ -101,15 +116,6 @@ func (s *Store) DeleteShard(shardID uint64) error {
 	return nil
 }

-// newShard returns a shard and copies configuration settings from the store.
-func (s *Store) newShard(index *DatabaseIndex, path string) *Shard {
-	sh := NewShard(index, path)
-	sh.MaxWALSize = s.MaxWALSize
-	sh.WALFlushInterval = s.WALFlushInterval
-	sh.WALPartitionFlushDelay = s.WALPartitionFlushDelay
-	return sh
-}
-
 // DeleteDatabase will close all shards associated with a database and remove the directory and files from disk.
 func (s *Store) DeleteDatabase(name string, shardIDs []uint64) error {
 	s.mu.Lock()
@ -127,12 +133,6 @@ func (s *Store) DeleteDatabase(name string, shardIDs []uint64) error {
 	return nil
 }

-func (s *Store) Shard(shardID uint64) *Shard {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-	return s.shards[shardID]
-}
-
 // ShardIDs returns a slice of all ShardIDs under management.
 func (s *Store) ShardIDs() []uint64 {
 	ids := make([]uint64, 0, len(s.shards))
@ -173,7 +173,7 @@ func (s *Store) deleteSeries(keys []string) error {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 	for _, sh := range s.shards {
-		if err := sh.deleteSeries(keys); err != nil {
+		if err := sh.DeleteSeries(keys); err != nil {
 			return err
 		}
 	}
@ -185,7 +185,7 @@ func (s *Store) deleteMeasurement(name string, seriesKeys []string) error {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 	for _, sh := range s.shards {
-		if err := sh.deleteMeasurement(name, seriesKeys); err != nil {
+		if err := sh.DeleteMeasurement(name, seriesKeys); err != nil {
 			return err
 		}
 	}
@ -236,8 +236,11 @@ func (s *Store) loadShards() error {
 					continue
 				}

-				shard := s.newShard(s.databaseIndexes[db], path)
-				shard.Open()
+				shard := NewShard(s.databaseIndexes[db], path, s.EngineOptions)
+				err = shard.Open()
+				if err != nil {
+					return fmt.Errorf("failed to open shard %d: %s", shardID, err)
+				}
 				s.shards[shardID] = shard
 			}
 		}
@ -253,6 +256,8 @@ func (s *Store) Open() error {
 	s.shards = map[uint64]*Shard{}
 	s.databaseIndexes = map[string]*DatabaseIndex{}

+	s.Logger.Printf("Using data dir: %v", s.Path())
+
 	// Create directory.
 	if err := os.MkdirAll(s.path, 0777); err != nil {
 		return err
@ -281,18 +286,6 @@ func (s *Store) WriteToShard(shardID uint64, points []Point) error {
 	return sh.WritePoints(points)
 }

-// Flush forces all shards to write their WAL data to the index.
-func (s *Store) Flush() error {
-	s.mu.RLock()
-	defer s.mu.RUnlock()
-	for shardID, sh := range s.shards {
-		if err := sh.Flush(s.WALPartitionFlushDelay); err != nil {
-			return fmt.Errorf("flush: shard=%d, err=%s", shardID, err)
-		}
-	}
-	return nil
-}
-
 func (s *Store) CreateMapper(shardID uint64, query string, chunkSize int) (Mapper, error) {
 	q, err := influxql.NewParser(strings.NewReader(query)).ParseStatement()
 	if err != nil {
@ -309,10 +302,7 @@ func (s *Store) CreateMapper(shardID uint64, query string, chunkSize int) (Mappe
 		return nil, nil
 	}

-	if (stmt.IsRawQuery && !stmt.HasDistinct()) || stmt.IsSimpleDerivative() {
-		return NewRawMapper(shard, stmt, chunkSize), nil
-	}
-	return NewAggMapper(shard, stmt), nil
+	return NewLocalMapper(shard, stmt, chunkSize), nil
 }

 func (s *Store) Close() error {
--- a/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/store_test.go
+++ b/Godeps/_workspace/src/github.com/influxdb/influxdb/tsdb/store_test.go
@ -1,4 +1,4 @@
-package tsdb
+package tsdb_test

 import (
 	"io/ioutil"
@ -6,6 +6,8 @@ import (
 	"path/filepath"
 	"testing"
 	"time"
+
+	"github.com/influxdb/influxdb/tsdb"
 )

 func TestStoreOpen(t *testing.T) {
@ -19,13 +21,13 @@ func TestStoreOpen(t *testing.T) {
 		t.Fatalf("failed to create test db dir: %v", err)
 	}

-	s := NewStore(dir)
+	s := tsdb.NewStore(dir)
 	if err := s.Open(); err != nil {
 		t.Fatalf("Store.Open() failed: %v", err)
 	}

-	if exp := 1; len(s.databaseIndexes) != exp {
-		t.Fatalf("database index count mismatch: got %v, exp %v", len(s.databaseIndexes), exp)
+	if got, exp := s.DatabaseIndexN(), 1; got != exp {
+		t.Fatalf("database index count mismatch: got %v, exp %v", got, exp)
 	}
 }

@ -46,26 +48,25 @@ func TestStoreOpenShard(t *testing.T) {
 		t.Fatalf("Store.Open() failed to create test shard 1: %v", err)
 	}

-	s := NewStore(dir)
+	s := tsdb.NewStore(dir)
 	if err := s.Open(); err != nil {
 		t.Fatalf("Store.Open() failed: %v", err)
 	}

-	if exp := 1; len(s.databaseIndexes) != exp {
-		t.Fatalf("Store.Open() database index count mismatch: got %v, exp %v", len(s.databaseIndexes), exp)
+	if got, exp := s.DatabaseIndexN(), 1; got != exp {
+		t.Fatalf("Store.Open() database index count mismatch: got %v, exp %v", got, exp)
 	}

-	if _, ok := s.databaseIndexes["mydb"]; !ok {
+	if di := s.DatabaseIndex("mydb"); di == nil {
 		t.Errorf("Store.Open() database myb does not exist")
 	}

-	if exp := 1; len(s.shards) != exp {
-		t.Fatalf("Store.Open() shard count mismatch: got %v, exp %v", len(s.shards), exp)
+	if got, exp := s.ShardN(), 1; got != exp {
+		t.Fatalf("Store.Open() shard count mismatch: got %v, exp %v", got, exp)
 	}

-	sh := s.shards[uint64(1)]
-	if sh.path != shardPath {
-		t.Errorf("Store.Open() shard path mismatch: got %v, exp %v", sh.path, shardPath)
+	if sh := s.Shard(1); sh.Path() != shardPath {
+		t.Errorf("Store.Open() shard path mismatch: got %v, exp %v", sh.Path(), shardPath)
 	}
 }

@ -80,16 +81,16 @@ func TestStoreOpenShardCreateDelete(t *testing.T) {
 		t.Fatalf("Store.Open() failed to create test db dir: %v", err)
 	}

-	s := NewStore(dir)
+	s := tsdb.NewStore(dir)
 	if err := s.Open(); err != nil {
 		t.Fatalf("Store.Open() failed: %v", err)
 	}

-	if exp := 1; len(s.databaseIndexes) != exp {
-		t.Fatalf("Store.Open() database index count mismatch: got %v, exp %v", len(s.databaseIndexes), exp)
+	if got, exp := s.DatabaseIndexN(), 1; got != exp {
+		t.Fatalf("Store.Open() database index count mismatch: got %v, exp %v", got, exp)
 	}

-	if _, ok := s.databaseIndexes["mydb"]; !ok {
+	if di := s.DatabaseIndex("mydb"); di == nil {
 		t.Errorf("Store.Open() database mydb does not exist")
 	}

@ -97,8 +98,8 @@ func TestStoreOpenShardCreateDelete(t *testing.T) {
 		t.Fatalf("Store.Open() failed to create shard")
 	}

-	if exp := 1; len(s.shards) != exp {
-		t.Fatalf("Store.Open() shard count mismatch: got %v, exp %v", len(s.shards), exp)
+	if got, exp := s.ShardN(), 1; got != exp {
+		t.Fatalf("Store.Open() shard count mismatch: got %v, exp %v", got, exp)
 	}

 	shardIDs := s.ShardIDs()
@ -110,7 +111,7 @@ func TestStoreOpenShardCreateDelete(t *testing.T) {
 		t.Fatalf("Store.Open() failed to delete shard: %v", err)
 	}

-	if _, ok := s.shards[uint64(1)]; ok {
+	if sh := s.Shard(1); sh != nil {
 		t.Fatal("Store.Open() shard ID 1 still exists")
 	}
 }
@ -127,17 +128,17 @@ func TestStoreOpenNotDatabaseDir(t *testing.T) {
 		t.Fatalf("Store.Open() failed to create test db dir: %v", err)
 	}

-	s := NewStore(dir)
+	s := tsdb.NewStore(dir)
 	if err := s.Open(); err != nil {
 		t.Fatalf("Store.Open() failed: %v", err)
 	}

-	if exp := 0; len(s.databaseIndexes) != exp {
-		t.Fatalf("Store.Open() database index count mismatch: got %v, exp %v", len(s.databaseIndexes), exp)
+	if got, exp := s.DatabaseIndexN(), 0; got != exp {
+		t.Fatalf("Store.Open() database index count mismatch: got %v, exp %v", got, exp)
 	}

-	if exp := 0; len(s.shards) != exp {
-		t.Fatalf("Store.Open() shard count mismatch: got %v, exp %v", len(s.shards), exp)
+	if got, exp := s.ShardN(), 0; got != exp {
+		t.Fatalf("Store.Open() shard count mismatch: got %v, exp %v", got, exp)
 	}
 }

@ -157,21 +158,21 @@ func TestStoreOpenNotRPDir(t *testing.T) {
 		t.Fatalf("Store.Open() failed to create test retention policy directory: %v", err)
 	}

-	s := NewStore(dir)
+	s := tsdb.NewStore(dir)
 	if err := s.Open(); err != nil {
 		t.Fatalf("Store.Open() failed: %v", err)
 	}

-	if exp := 1; len(s.databaseIndexes) != exp {
-		t.Fatalf("Store.Open() database index count mismatch: got %v, exp %v", len(s.databaseIndexes), exp)
+	if got, exp := s.DatabaseIndexN(), 1; got != exp {
+		t.Fatalf("Store.Open() database index count mismatch: got %v, exp %v", got, exp)
 	}

-	if _, ok := s.databaseIndexes["mydb"]; !ok {
+	if di := s.DatabaseIndex("mydb"); di == nil {
 		t.Errorf("Store.Open() database myb does not exist")
 	}

-	if exp := 0; len(s.shards) != exp {
-		t.Fatalf("Store.Open() shard count mismatch: got %v, exp %v", len(s.shards), exp)
+	if got, exp := s.ShardN(), 0; got != exp {
+		t.Fatalf("Store.Open() shard count mismatch: got %v, exp %v", got, exp)
 	}
 }

@ -193,21 +194,21 @@ func TestStoreOpenShardBadShardPath(t *testing.T) {
 		t.Fatalf("Store.Open() failed to create test shard 1: %v", err)
 	}

-	s := NewStore(dir)
+	s := tsdb.NewStore(dir)
 	if err := s.Open(); err != nil {
 		t.Fatalf("Store.Open() failed: %v", err)
 	}

-	if exp := 1; len(s.databaseIndexes) != exp {
-		t.Fatalf("Store.Open() database index count mismatch: got %v, exp %v", len(s.databaseIndexes), exp)
+	if got, exp := s.DatabaseIndexN(), 1; got != exp {
+		t.Fatalf("Store.Open() database index count mismatch: got %v, exp %v", got, exp)
 	}

-	if _, ok := s.databaseIndexes["mydb"]; !ok {
+	if di := s.DatabaseIndex("mydb"); di == nil {
 		t.Errorf("Store.Open() database myb does not exist")
 	}

-	if exp := 0; len(s.shards) != exp {
-		t.Fatalf("Store.Open() shard count mismatch: got %v, exp %v", len(s.shards), exp)
+	if got, exp := s.ShardN(), 0; got != exp {
+		t.Fatalf("Store.Open() shard count mismatch: got %v, exp %v", got, exp)
 	}

 }
@ -218,17 +219,17 @@ func benchmarkStoreOpen(b *testing.B, mCnt, tkCnt, tvCnt, pntCnt, shardCnt int)
 	// Generate test series (measurements + unique tag sets).
 	series := genTestSeries(mCnt, tkCnt, tvCnt)
 	// Generate point data to write to the shards.
-	points := []Point{}
+	points := []tsdb.Point{}
 	for _, s := range series {
 		for val := 0.0; val < float64(pntCnt); val++ {
-			p := NewPoint(s.Measurement, s.Series.Tags, map[string]interface{}{"value": val}, time.Now())
+			p := tsdb.NewPoint(s.Measurement, s.Series.Tags, map[string]interface{}{"value": val}, time.Now())
 			points = append(points, p)
 		}
 	}
 	// Create a temporary directory for the test data.
 	dir, _ := ioutil.TempDir("", "store_test")
 	// Create the store.
-	store := NewStore(dir)
+	store := tsdb.NewStore(dir)
 	// Open the store.
 	if err := store.Open(); err != nil {
 		b.Fatalf("benchmarkStoreOpen: %s", err)
@ -249,7 +250,7 @@ func benchmarkStoreOpen(b *testing.B, mCnt, tkCnt, tvCnt, pntCnt, shardCnt int)
 	// Run the benchmark loop.
 	b.ResetTimer()
 	for n := 0; n < b.N; n++ {
-		store := NewStore(dir)
+		store := tsdb.NewStore(dir)
 		if err := store.Open(); err != nil {
 			b.Fatalf("benchmarkStoreOpen: %s", err)
 		}
@ -260,7 +261,7 @@ func benchmarkStoreOpen(b *testing.B, mCnt, tkCnt, tvCnt, pntCnt, shardCnt int)
 	}
 }

-func chunkedWriteStoreShard(store *Store, shardID int, points []Point) {
+func chunkedWriteStoreShard(store *tsdb.Store, shardID int, points []tsdb.Point) {
 	nPts := len(points)
 	chunkSz := 10000
 	start := 0