exp/utf8string/string.go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package utf8string provides an efficient way to index strings by rune rather than by byte.
package utf8string

import (
	"errors"
	"unicode/utf8"
)

// String wraps a regular string with a small structure that provides more
// efficient indexing by code point index, as opposed to byte index.
// Scanning incrementally forwards or backwards is O(1) per index operation
// (although not as fast a range clause going forwards).  Random access is
// O(N) in the length of the string, but the overhead is less than always
// scanning from the beginning.
// If the string is ASCII, random access is O(1).
// Unlike the built-in string type, String has internal mutable state and
// is not thread-safe.
type String struct {
	str      string
	numRunes int
	// If width > 0, the rune at runePos starts at bytePos and has the specified width.
	width    int
	bytePos  int
	runePos  int
	nonASCII int // byte index of the first non-ASCII rune.
}

// NewString returns a new UTF-8 string with the provided contents.
func NewString(contents string) *String {
	return new(String).Init(contents)
}

// Init initializes an existing String to hold the provided contents.
// It returns a pointer to the initialized String.
func (s *String) Init(contents string) *String {
	s.str = contents
	s.bytePos = 0
	s.runePos = 0
	for i := 0; i < len(contents); i++ {
		if contents[i] >= utf8.RuneSelf {
			// Not ASCII.
			s.numRunes = utf8.RuneCountInString(contents)
			_, s.width = utf8.DecodeRuneInString(contents)
			s.nonASCII = i
			return s
		}
	}
	// ASCII is simple.  Also, the empty string is ASCII.
	s.numRunes = len(contents)
	s.width = 0
	s.nonASCII = len(contents)
	return s
}

// String returns the contents of the String.  This method also means the
// String is directly printable by fmt.Print.
func (s *String) String() string {
	return s.str
}

// RuneCount returns the number of runes (Unicode code points) in the String.
func (s *String) RuneCount() int {
	return s.numRunes
}

// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
func (s *String) IsASCII() bool {
	return s.width == 0
}

// Slice returns the string sliced at rune positions [i:j].
func (s *String) Slice(i, j int) string {
	// ASCII is easy.  Let the compiler catch the indexing error if there is one.
	if j < s.nonASCII {
		return s.str[i:j]
	}
	if i < 0 || j > s.numRunes || i > j {
		panic(sliceOutOfRange)
	}
	if i == j {
		return ""
	}
	// For non-ASCII, after At(i), bytePos is always the position of the indexed character.
	var low, high int
	switch {
	case i < s.nonASCII:
		low = i
	case i == s.numRunes:
		low = len(s.str)
	default:
		s.At(i)
		low = s.bytePos
	}
	switch {
	case j == s.numRunes:
		high = len(s.str)
	default:
		s.At(j)
		high = s.bytePos
	}
	return s.str[low:high]
}

// At returns the rune with index i in the String.  The sequence of runes is the same
// as iterating over the contents with a "for range" clause.
func (s *String) At(i int) rune {
	// ASCII is easy.  Let the compiler catch the indexing error if there is one.
	if i < s.nonASCII {
		return rune(s.str[i])
	}

	// Now we do need to know the index is valid.
	if i < 0 || i >= s.numRunes {
		panic(outOfRange)
	}

	var r rune

	// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
	// With these cases, all scans from beginning or end work in O(1) time per rune.
	switch {

	case i == s.runePos-1: // backing up one rune
		r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
		s.runePos = i
		s.bytePos -= s.width
		return r
	case i == s.runePos+1: // moving ahead one rune
		s.runePos = i
		s.bytePos += s.width
		fallthrough
	case i == s.runePos:
		r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
		return r
	case i == 0: // start of string
		r, s.width = utf8.DecodeRuneInString(s.str)
		s.runePos = 0
		s.bytePos = 0
		return r

	case i == s.numRunes-1: // last rune in string
		r, s.width = utf8.DecodeLastRuneInString(s.str)
		s.runePos = i
		s.bytePos = len(s.str) - s.width
		return r
	}

	// We need to do a linear scan.  There are three places to start from:
	// 1) The beginning
	// 2) bytePos/runePos.
	// 3) The end
	// Choose the closest in rune count, scanning backwards if necessary.
	forward := true
	if i < s.runePos {
		// Between beginning and pos.  Which is closer?
		// Since both i and runePos are guaranteed >= nonASCII, that's the
		// lowest location we need to start from.
		if i < (s.runePos-s.nonASCII)/2 {
			// Scan forward from beginning
			s.bytePos, s.runePos = s.nonASCII, s.nonASCII
		} else {
			// Scan backwards from where we are
			forward = false
		}
	} else {
		// Between pos and end.  Which is closer?
		if i-s.runePos < (s.numRunes-s.runePos)/2 {
			// Scan forward from pos
		} else {
			// Scan backwards from end
			s.bytePos, s.runePos = len(s.str), s.numRunes
			forward = false
		}
	}
	if forward {
		// TODO: Is it much faster to use a range loop for this scan?
		for {
			r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
			if s.runePos == i {
				break
			}
			s.runePos++
			s.bytePos += s.width
		}
	} else {
		for {
			r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
			s.runePos--
			s.bytePos -= s.width
			if s.runePos == i {
				break
			}
		}
	}
	return r
}

var outOfRange = errors.New("utf8.String: index out of range")
var sliceOutOfRange = errors.New("utf8.String: slice index out of range")