Update all dependencies

This commit is contained in:
Gabriel Adrian Samfira 2026-02-10 17:54:12 +02:00 committed by Gabriel
parent 88b832172e
commit d344396706
51 changed files with 8645 additions and 60261 deletions

View file

@ -1,2 +0,0 @@
.DS_Store
*.test

View file

@ -1,21 +0,0 @@
MIT License
Copyright (c) 2025 Matt Sherman
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -1,64 +0,0 @@
# stringish
A small Go module that provides a generic type constraint for “string-like”
data, and a utf8 package that works with both strings and byte slices
without conversions.
```go
type Interface interface {
~[]byte | ~string
}
```
[![Go Reference](https://pkg.go.dev/badge/github.com/clipperhouse/stringish/utf8.svg)](https://pkg.go.dev/github.com/clipperhouse/stringish/utf8)
[![Test Status](https://github.com/clipperhouse/stringish/actions/workflows/gotest.yml/badge.svg)](https://github.com/clipperhouse/stringish/actions/workflows/gotest.yml)
## Install
```
go get github.com/clipperhouse/stringish
```
## Examples
```go
import (
"github.com/clipperhouse/stringish"
"github.com/clipperhouse/stringish/utf8"
)
s := "Hello, 世界"
r, size := utf8.DecodeRune(s) // not DecodeRuneInString 🎉
b := []byte("Hello, 世界")
r, size = utf8.DecodeRune(b) // same API!
func MyFoo[T stringish.Interface](s T) T {
// pass a string or a []byte
// iterate, slice, transform, whatever
}
```
## Motivation
Sometimes we want APIs to accept `string` or `[]byte` without having to convert
between those types. That conversion usually allocates!
By implementing with `stringish.Interface`, we can have a single API, and
single implementation for both types: one `Foo` instead of `Foo` and
`FooString`.
We have converted the
[`unicode/utf8` package](https://github.com/clipperhouse/stringish/blob/main/utf8/utf8.go)
as an example -- note the absence of `*InString` funcs. We might look at `x/text`
next.
## Used by
- clipperhouse/uax29: [stringish trie](https://github.com/clipperhouse/uax29/blob/master/graphemes/trie.go#L27), [stringish iterator](https://github.com/clipperhouse/uax29/blob/master/internal/iterators/iterator.go#L9), [stringish SplitFunc](https://github.com/clipperhouse/uax29/blob/master/graphemes/splitfunc.go#L21)
- [clipperhouse/displaywidth](https://github.com/clipperhouse/displaywidth)
## Prior discussion
- [Consideration of similar by the Go team](https://github.com/golang/go/issues/48643)

View file

@ -1,5 +0,0 @@
package stringish
// Interface is a generic type constraint for "string-like" data: it is
// satisfied by any type whose underlying type is []byte or string.
type Interface interface {
~[]byte | ~string
}

View file

@ -76,15 +76,17 @@ for tokens.Next() { // Next() returns true until end of data
### Benchmarks
On a Mac M2 laptop, we see around 200MB/s, or around 100 million graphemes per second, and no allocations.
```
goos: darwin
goarch: arm64
pkg: github.com/clipperhouse/uax29/graphemes/comparative
cpu: Apple M2
BenchmarkGraphemes/clipperhouse/uax29-8 171895 ns/op 203.39 MB/s 0 B/op 0 allocs/op
BenchmarkGraphemes/rivo/uniseg-8 1980475 ns/op 17.65 MB/s 0 B/op 0 allocs/op
BenchmarkGraphemesMixed/clipperhouse/uax29-8 142635 ns/op 245.12 MB/s 0 B/op 0 allocs/op
BenchmarkGraphemesMixed/rivo/uniseg-8 2018284 ns/op 17.32 MB/s 0 B/op 0 allocs/op
BenchmarkGraphemesASCII/clipperhouse/uax29-8 8846 ns/op 508.73 MB/s 0 B/op 0 allocs/op
BenchmarkGraphemesASCII/rivo/uniseg-8 366760 ns/op 12.27 MB/s 0 B/op 0 allocs/op
```
### Invalid inputs

View file

@ -0,0 +1,119 @@
package graphemes
// ansiEscapeLength reports how many bytes at the start of data form a complete
// ANSI escape sequence, or 0 when data does not begin with one. The input is
// UTF-8, so only 7-bit sequences introduced by ESC are recognized (the 8-bit
// C1 controls 0x80-0x9F can be UTF-8 continuation bytes).
//
// Recognized forms (ECMA-48 / ISO 6429):
//   - CSI: ESC [ then parameter bytes (0x30-0x3F), intermediate bytes
//     (0x20-0x2F), and a final byte (0x40-0x7E)
//   - OSC: ESC ] then payload until ST (ESC \) or BEL (0x07)
//   - DCS, SOS, PM, APC: ESC P / X / ^ / _ then payload until ST (ESC \)
//   - Two-byte: ESC + Fe (0x40-0x5F, excluding the introducers above) or
//     ESC + Fp (0x30-0x3F)
//   - nF: ESC, one or more intermediates (0x20-0x2F), then a final (0x30-0x7E)
//
// An introducer whose body is unterminated or malformed yields 0.
func ansiEscapeLength[T ~string | ~[]byte](data T) int {
	// Every sequence needs ESC plus at least one more byte.
	if len(data) < 2 || data[0] != esc {
		return 0
	}

	second := data[1]
	switch {
	case second == '[':
		// CSI
		if body := csiLength(data[2:]); body > 0 {
			return 2 + body
		}
		return 0

	case second == ']':
		// OSC: terminated by BEL or ST
		if body := oscLength(data[2:]); body > 0 {
			return 2 + body
		}
		return 0

	case second == 'P' || second == 'X' || second == '^' || second == '_':
		// DCS, SOS, PM, APC: terminated by ST (ESC \) only
		if body := stSequenceLength(data[2:]); body > 0 {
			return 2 + body
		}
		return 0

	case second >= 0x30 && second <= 0x5F:
		// Fp (private, 0x30-0x3F) or Fe (C1, 0x40-0x5F) two-byte sequence.
		// The Fe introducers [ ] P X ^ _ were already handled above.
		return 2

	case second >= 0x20 && second <= 0x2F:
		// nF: skip the run of intermediates, then require exactly one final.
		end := 2
		for end < len(data) && data[end] >= 0x20 && data[end] <= 0x2F {
			end++
		}
		if end < len(data) && data[end] >= 0x30 && data[end] <= 0x7E {
			return end + 1
		}
		return 0
	}

	return 0
}
// csiLength measures a CSI body: the bytes that follow "ESC [" up to and
// including the final byte. It returns 0 when data does not hold a complete,
// well-formed body.
//
// Per ECMA-48 the body is laid out as:
//
//	parameters (0x30-0x3F)*, intermediates (0x20-0x2F)*, final (0x40-0x7E)
//
// so a parameter byte appearing after any intermediate byte is invalid, as is
// any byte outside those three ranges.
func csiLength[T ~string | ~[]byte](data T) int {
	intermediates := false
	for pos := 0; pos < len(data); pos++ {
		c := data[pos]
		switch {
		case c >= 0x40 && c <= 0x7E:
			// Final byte: the sequence is complete.
			return pos + 1
		case c >= 0x30 && c <= 0x3F:
			// Parameter byte: only legal before the first intermediate.
			if intermediates {
				return 0
			}
		case c >= 0x20 && c <= 0x2F:
			intermediates = true
		default:
			// Control characters, DEL, and non-ASCII bytes cannot appear here.
			return 0
		}
	}
	// Ran out of input before a final byte.
	return 0
}
// oscLength measures an OSC body: the bytes that follow "ESC ]" up to and
// including the terminator. Following widespread terminal convention, the
// terminator may be either BEL (0x07) or ST (ESC \). It returns 0 when no
// terminator is found in data.
func oscLength[T ~string | ~[]byte](data T) int {
	size := len(data)
	for pos := 0; pos < size; pos++ {
		switch data[pos] {
		case bel:
			return pos + 1
		case esc:
			// ESC only terminates when immediately followed by a backslash;
			// otherwise keep scanning.
			if next := pos + 1; next < size && data[next] == '\\' {
				return next + 1
			}
		}
	}
	return 0
}
// stSequenceLength measures a control-string body: the bytes that follow
// "ESC x" up to and including the ST (ESC \) terminator. It serves DCS, SOS,
// PM, and APC, which per ECMA-48 must end with ST and do not accept BEL.
// It returns 0 when data contains no ST.
func stSequenceLength[T ~string | ~[]byte](data T) int {
	// ST is two bytes, so the scan can stop one short of the end.
	for pos := 0; pos+1 < len(data); pos++ {
		if data[pos] == esc && data[pos+1] == '\\' {
			return pos + 2
		}
	}
	return 0
}

View file

@ -1,5 +1,7 @@
package graphemes
import "unicode/utf8"
// FromString returns an iterator for the grapheme clusters in the input string.
// Iterate while Next() is true, and access the grapheme via Value().
func FromString(s string) *Iterator[string] {
@ -25,6 +27,9 @@ type Iterator[T ~string | ~[]byte] struct {
data T
pos int
start int
// AnsiEscapeSequences treats ANSI escape sequences (ECMA-48) as single grapheme
// clusters when true. Default is false.
AnsiEscapeSequences bool
}
var (
@ -32,6 +37,12 @@ var (
splitFuncBytes = splitFunc[[]byte]
)
const (
esc = 0x1B
cr = 0x0D
bel = 0x07
)
// Next advances the iterator to the next grapheme cluster.
// Returns false when there are no more grapheme clusters.
func (iter *Iterator[T]) Next() bool {
@ -40,12 +51,18 @@ func (iter *Iterator[T]) Next() bool {
}
iter.start = iter.pos
// ASCII hot path: if current byte is printable ASCII and
// next byte is also ASCII (or end of data), return single byte
if iter.AnsiEscapeSequences && iter.data[iter.pos] == esc {
if a := ansiEscapeLength(iter.data[iter.pos:]); a > 0 {
iter.pos += a
return true
}
}
// ASCII hot path: any ASCII is one grapheme when next byte is ASCII or end.
// Fall through on CR so splitfunc can handle CR+LF as a single cluster.
b := iter.data[iter.pos]
if b >= 0x20 && b < 0x7F {
// If next byte is non-ASCII, it could be a combining mark
if iter.pos+1 >= len(iter.data) || iter.data[iter.pos+1] < 0x80 {
if b < utf8.RuneSelf && b != cr {
if iter.pos+1 >= len(iter.data) || iter.data[iter.pos+1] < utf8.RuneSelf {
iter.pos++
return true
}

View file

@ -2,8 +2,6 @@ package graphemes
import (
"bufio"
"github.com/clipperhouse/stringish"
)
// is determines if lookup intersects propert(ies)
@ -28,7 +26,7 @@ const (
// See https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
var SplitFunc bufio.SplitFunc = splitFunc[[]byte]
func splitFunc[T stringish.Interface](data T, atEOF bool) (advance int, token T, err error) {
func splitFunc[T ~string | ~[]byte](data T, atEOF bool) (advance int, token T, err error) {
var empty T
if len(data) == 0 {
return 0, empty, nil

View file

@ -3,8 +3,6 @@ package graphemes
// generated by github.com/clipperhouse/uax29/v2
// from https://www.unicode.org/Public/17.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
import "github.com/clipperhouse/stringish"
type property uint32
const (
@ -30,7 +28,7 @@ const (
// lookup returns the trie value for the first UTF-8 encoding in s and
// the width in bytes of this encoding. The size will be 0 if s does not
// hold enough bytes to complete the encoding. len(s) must be greater than 0.
func lookup[T stringish.Interface](s T) (v property, sz int) {
func lookup[T ~string | ~[]byte](s T) (v property, sz int) {
c0 := s[0]
switch {
case c0 < 0x80: // is ASCII