// Source file: scan.go
// Belonging package: bufio

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bufio
import (
	"bytes"
	"errors"
	"io"
	"unicode/utf8"
)
// Scanner provides a convenient interface for reading data such as
// a file of newline-delimited lines of text. Successive calls to
// the [Scanner.Scan] method will step through the 'tokens' of a file, skipping
// the bytes between the tokens. The specification of a token is
// defined by a split function of type [SplitFunc]; the default split
// function breaks the input into lines with line termination stripped.
// Split functions, installed with [Scanner.Split], are defined in this
// package for scanning a file into lines, bytes, UTF-8-encoded runes, and
// space-delimited words. The client may instead provide a custom split
// function.
//
// Scanning stops unrecoverably at EOF, the first I/O error, or a token too
// large to fit in the [Scanner.Buffer]. When a scan stops, the reader may have
// advanced arbitrarily far past the last token. Programs that need more
// control over error handling or large tokens, or must run sequential scans
// on a reader, should use [bufio.Reader] instead.
type Scanner struct {
r io.Reader // The reader provided by the client.
split SplitFunc // The function to split the tokens.
maxTokenSize int // Maximum size of a token; modified by tests.
token []byte // Last token returned by split.
buf []byte // Buffer used as argument to split.
start int // First non-processed byte in buf.
end int // End of data in buf.
err error // Sticky error; io.EOF is stored here too and is filtered out by Err.
empties int // Count of successive empty tokens.
scanCalled bool // Scan has been called; buffer is in use, so Buffer and Split panic.
done bool // Scan has finished: split returned ErrFinalToken, so Scan now returns false.
}
// SplitFunc is the signature of the split function used to tokenize the
// input. The arguments are an initial substring of the remaining unprocessed
// data and a flag, atEOF, that reports whether the Scanner's underlying
// reader has no more data to give. The Scanner sets atEOF once any read
// error, including [io.EOF], has been recorded. The return values are the
// number of bytes to advance the input and the next token to return to the
// user, if any, plus an error, if any.
//
// Scanning stops if the function returns an error, in which case some of
// the input may be discarded. If that error is [ErrFinalToken], scanning
// stops with no error. A non-nil token delivered with [ErrFinalToken]
// will be the last token, and a nil token with [ErrFinalToken]
// immediately stops the scanning.
//
// Otherwise, the [Scanner] advances the input. If the token is not nil,
// the [Scanner] returns it to the user. If the token is nil, the
// Scanner reads more data and continues scanning; if there is no more
// data--if atEOF was true--the [Scanner] returns. If the data does not
// yet hold a complete token, for instance if it has no newline while
// scanning lines, a [SplitFunc] can return (0, nil, nil) to signal the
// [Scanner] to read more data into the slice and try again with a
// longer slice starting at the same point in the input.
//
// The function is never called with an empty data slice unless atEOF
// is true. If atEOF is true, however, data may be non-empty and,
// as always, holds unprocessed text.
type SplitFunc func(data []byte, atEOF bool) (advance int, token []byte, err error)
// Errors returned by Scanner.
var (
// ErrTooLong is recorded when the buffer is full and cannot be grown
// any further, so the pending token cannot fit.
ErrTooLong = errors.New("bufio.Scanner: token too long")
// ErrNegativeAdvance is recorded when the split function returns an
// advance count below zero.
ErrNegativeAdvance = errors.New("bufio.Scanner: SplitFunc returns negative advance count")
// ErrAdvanceTooFar is recorded when the split function returns an
// advance count larger than the data it was given.
ErrAdvanceTooFar = errors.New("bufio.Scanner: SplitFunc returns advance count beyond input")
// ErrBadReadCount is recorded when the reader reports a negative byte
// count or more bytes than the space it was offered.
ErrBadReadCount = errors.New("bufio.Scanner: Read returned impossible count")
)
const (
// MaxScanTokenSize is the maximum size used to buffer a token
// unless the user provides an explicit buffer with [Scanner.Buffer].
// The actual maximum token size may be smaller as the buffer
// may need to include, for instance, a newline.
MaxScanTokenSize = 64 * 1024
// startBufSize is the size requested the first time Scan has to grow a
// zero-length buffer; the request is still capped by the maximum token size.
startBufSize = 4096 // Size of initial allocation for buffer.
)
// NewScanner returns a new [Scanner] to read from r.
// The split function defaults to [ScanLines].
func ( io.Reader) *Scanner {
return &Scanner{
r: ,
split: ScanLines,
maxTokenSize: MaxScanTokenSize,
}
}
// Err returns the first non-EOF error that was encountered by the [Scanner].
func ( *Scanner) () error {
if .err == io.EOF {
return nil
}
return .err
}
// Bytes returns the most recent token generated by a call to [Scanner.Scan].
// The underlying array may point to data that will be overwritten
// by a subsequent call to Scan. It does no allocation.
func ( *Scanner) () []byte {
return .token
}
// Text returns the most recent token generated by a call to [Scanner.Scan]
// as a newly allocated string holding its bytes.
func ( *Scanner) () string {
return string(.token)
}
// ErrFinalToken is a special sentinel error value. It is intended to be
// returned by a Split function to indicate that the scanning should stop
// with no error. If the token being delivered with this error is not nil,
// the token is the last token. If the token is nil, Scan returns false
// immediately and no further token is delivered.
//
// The value is useful to stop processing early or when it is necessary to
// deliver a final empty token (which is different from a nil token).
// One could achieve the same behavior with a custom error value but
// providing one here is tidier.
// See the emptyFinalToken example for a use of this value.
var ErrFinalToken = errors.New("final token")
// Scan advances the [Scanner] to the next token, which will then be
// available through the [Scanner.Bytes] or [Scanner.Text] method. It returns false when
// there are no more tokens, either by reaching the end of the input or an error.
// After Scan returns false, the [Scanner.Err] method will return any error that
// occurred during scanning, except that if it was [io.EOF], [Scanner.Err]
// will return nil.
// Scan panics if the split function returns too many empty
// tokens without advancing the input. This is a common error mode for
// scanners.
func ( *Scanner) () bool {
if .done {
return false
}
.scanCalled = true
// Loop until we have a token.
for {
// See if we can get a token with what we already have.
// If we've run out of data but have an error, give the split function
// a chance to recover any remaining, possibly empty token.
if .end > .start || .err != nil {
, , := .split(.buf[.start:.end], .err != nil)
if != nil {
if == ErrFinalToken {
.token =
.done = true
// When token is not nil, it means the scanning stops
// with a trailing token, and thus the return value
// should be true to indicate the existence of the token.
return != nil
}
.setErr()
return false
}
if !.advance() {
return false
}
.token =
if != nil {
if .err == nil || > 0 {
.empties = 0
} else {
// Returning tokens not advancing input at EOF.
.empties++
if .empties > maxConsecutiveEmptyReads {
panic("bufio.Scan: too many empty tokens without progressing")
}
}
return true
}
}
// We cannot generate a token with what we are holding.
// If we've already hit EOF or an I/O error, we are done.
if .err != nil {
// Shut it down.
.start = 0
.end = 0
return false
}
// Must read more data.
// First, shift data to beginning of buffer if there's lots of empty space
// or space is needed.
if .start > 0 && (.end == len(.buf) || .start > len(.buf)/2) {
copy(.buf, .buf[.start:.end])
.end -= .start
.start = 0
}
// Is the buffer full? If so, resize.
if .end == len(.buf) {
// Guarantee no overflow in the multiplication below.
const = int(^uint(0) >> 1)
if len(.buf) >= .maxTokenSize || len(.buf) > /2 {
.setErr(ErrTooLong)
return false
}
:= len(.buf) * 2
if == 0 {
= startBufSize
}
= min(, .maxTokenSize)
:= make([]byte, )
copy(, .buf[.start:.end])
.buf =
.end -= .start
.start = 0
}
// Finally we can read some input. Make sure we don't get stuck with
// a misbehaving Reader. Officially we don't need to do this, but let's
// be extra careful: Scanner is for safe, simple jobs.
for := 0; ; {
, := .r.Read(.buf[.end:len(.buf)])
if < 0 || len(.buf)-.end < {
.setErr(ErrBadReadCount)
break
}
.end +=
if != nil {
.setErr()
break
}
if > 0 {
.empties = 0
break
}
++
if > maxConsecutiveEmptyReads {
.setErr(io.ErrNoProgress)
break
}
}
}
}
// advance consumes n bytes of the buffer. It reports whether the advance was legal.
func ( *Scanner) ( int) bool {
if < 0 {
.setErr(ErrNegativeAdvance)
return false
}
if > .end-.start {
.setErr(ErrAdvanceTooFar)
return false
}
.start +=
return true
}
// setErr records the first error encountered.
func ( *Scanner) ( error) {
if .err == nil || .err == io.EOF {
.err =
}
}
// Buffer sets the initial buffer to use when scanning
// and the maximum size of buffer that may be allocated during scanning.
// The maximum token size must be less than the larger of max and cap(buf).
// If max <= cap(buf), [Scanner.Scan] will use this buffer only and do no allocation.
//
// By default, [Scanner.Scan] uses an internal buffer and sets the
// maximum token size to [MaxScanTokenSize].
//
// Buffer panics if it is called after scanning has started.
func ( *Scanner) ( []byte, int) {
if .scanCalled {
panic("Buffer called after Scan")
}
.buf = [0:cap()]
.maxTokenSize =
}
// Split sets the split function for the [Scanner].
// The default split function is [ScanLines].
//
// Split panics if it is called after scanning has started.
func ( *Scanner) ( SplitFunc) {
if .scanCalled {
panic("Split called after Scan")
}
.split =
}
// Split functions
// ScanBytes is a split function for a [Scanner] that returns each byte as a token.
// At EOF with no data left it returns (0, nil, nil). Like every split
// function it expects data to be non-empty unless atEOF is true.
func ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	return 1, data[0:1], nil
}
// errorRune is the UTF-8 encoding of utf8.RuneError, returned by ScanRunes
// as the token for each byte of invalid input.
var errorRune = []byte(string(utf8.RuneError))

// ScanRunes is a split function for a [Scanner] that returns each
// UTF-8-encoded rune as a token. The sequence of runes returned is
// equivalent to that from a range loop over the input as a string, which
// means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd".
// Because of the Scan interface, this makes it impossible for the client to
// distinguish correctly encoded replacement runes from encoding errors.
func ScanRunes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}

	// Fast path 1: ASCII.
	if data[0] < utf8.RuneSelf {
		return 1, data[0:1], nil
	}

	// Fast path 2: Correct UTF-8 decode without error.
	_, width := utf8.DecodeRune(data)
	if width > 1 {
		// It's a valid encoding. Width cannot be one for a correctly encoded
		// non-ASCII rune.
		return width, data[0:width], nil
	}

	// We know it's an error: we have width==1 and implicitly r==utf8.RuneError.
	// Is the error because there wasn't a full rune to be decoded?
	// FullRune distinguishes correctly between erroneous and incomplete encodings.
	if !atEOF && !utf8.FullRune(data) {
		// Incomplete; get more bytes.
		return 0, nil, nil
	}

	// We have a real UTF-8 encoding error. Return a properly encoded error rune
	// but advance only one byte. This matches the behavior of a range loop over
	// an incorrectly encoded string.
	return 1, errorRune, nil
}
// dropCR drops a terminal \r from the data.
// The result is a sub-slice of data; nothing is copied.
func dropCR(data []byte) []byte {
	if len(data) > 0 && data[len(data)-1] == '\r' {
		return data[0 : len(data)-1]
	}
	return data
}

// ScanLines is a split function for a [Scanner] that returns each line of
// text, stripped of any trailing end-of-line marker. The returned line may
// be empty. The end-of-line marker is one optional carriage return followed
// by one mandatory newline. In regular expression notation, it is `\r?\n`.
// The last non-empty line of input will be returned even if it has no
// newline.
func ScanLines(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	if i := bytes.IndexByte(data, '\n'); i >= 0 {
		// We have a full newline-terminated line.
		return i + 1, dropCR(data[0:i]), nil
	}
	// If we're at EOF, we have a final, non-terminated line. Return it.
	if atEOF {
		return len(data), dropCR(data), nil
	}
	// Request more data.
	return 0, nil, nil
}
// isSpace reports whether the character is a Unicode white space character.
// We avoid dependency on the unicode package, but check validity of the implementation
// in the tests.
func isSpace(r rune) bool {
	if r <= '\u00FF' {
		// Obvious ASCII ones: \t through \r plus space. Plus two Latin-1 oddballs.
		switch r {
		case ' ', '\t', '\n', '\v', '\f', '\r':
			return true
		case '\u0085', '\u00A0':
			return true
		}
		return false
	}
	// High-valued ones.
	if '\u2000' <= r && r <= '\u200a' {
		return true
	}
	switch r {
	case '\u1680', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000':
		return true
	}
	return false
}

// ScanWords is a split function for a [Scanner] that returns each
// space-separated word of text, with surrounding spaces deleted. It will
// never return an empty string. The definition of space is set by
// unicode.IsSpace.
func ScanWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip leading spaces.
	start := 0
	for width := 0; start < len(data); start += width {
		var r rune
		r, width = utf8.DecodeRune(data[start:])
		if !isSpace(r) {
			break
		}
	}
	// Scan until space, marking end of word.
	for width, i := 0, start; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		if isSpace(r) {
			// Consume the terminating space as well as the word.
			return i + width, data[start:i], nil
		}
	}
	// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}
	// Request more data.
	// The leading spaces are discarded so they are not rescanned.
	return start, nil, nil
}
// The pages are generated with Golds v0.7.0-preview (GOOS=linux GOARCH=amd64).
// Golds is a Go 101 project developed by Tapir Liu. PRs and bug reports are
// welcome and can be submitted to the issue list. Please follow @zigo_101
// (reachable from the left QR code) to get the latest news of Golds.