-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathheuristics.go
206 lines (184 loc) · 5.65 KB
/
heuristics.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
// Copyright ©2022 Dan Kortschak. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
import (
"fmt"
"go/scanner"
"go/token"
"regexp"
"strconv"
"strings"
"unicode"
"unicode/utf8"
)
// heuristic is a type that can suggest whether a word is acceptable.
type heuristic interface {
// isAcceptable returns whether the provided word is acceptable. If
// partial is true, the word is a portion of a whole word that has
// been split. Implementations whose decision does not depend on
// how the word was obtained are free to ignore partial.
isAcceptable(word string, partial bool) bool
}
// wordLen is a heuristic that accepts words solely because of their
// length.
type wordLen struct {
	// max is the longest word, measured in bytes, that is not
	// accepted by the heuristic. A value of zero or less disables
	// the heuristic.
	max int
}

// isAcceptable returns whether word is longer, in bytes, than the
// maximum word length to consider. When no positive maximum has been
// configured no word is accepted.
func (h wordLen) isAcceptable(word string, _ bool) bool {
	if h.max <= 0 {
		// No limit configured; never accept on length alone.
		return false
	}
	return h.max < len(word)
}
// allUpper is a heuristic that accepts words written entirely in
// uppercase.
type allUpper struct{}

// isAcceptable returns whether every rune in word counts as uppercase.
// Numerals and underscores are treated as uppercase for this test, and a
// single trailing 's' is disregarded so that plurals of initialisms and
// acronyms are accepted.
func (allUpper) isAcceptable(word string, _ bool) bool {
	stem := strings.TrimSuffix(word, "s")
	// notUpper identifies the first rune, if any, that disqualifies
	// the word.
	notUpper := func(r rune) bool {
		return r != '_' && !unicode.IsDigit(r) && !unicode.IsUpper(r)
	}
	return strings.IndexFunc(stem, notUpper) == -1
}
// isSingle is a heuristic that accepts words consisting of exactly one
// rune.
type isSingle struct{}

// isAcceptable returns whether the query word is a single rune.
func (isSingle) isAcceptable(word string, _ bool) bool {
	// The word is a single rune exactly when decoding its first rune
	// consumes the entire, non-empty, string.
	_, size := utf8.DecodeRuneInString(word)
	return size != 0 && size == len(word)
}
// isNakedHex is a heuristic that accepts unprefixed hex numbers as valid
// words.
type isNakedHex struct {
	// minLen is the shortest word, in bytes, that will be accepted.
	// It guards against accepting short misspelled words that happen
	// to contain only hex digits. A zero minLen disables the
	// heuristic.
	minLen int
}

// isAcceptable returns whether the query word is a hex number of at
// least the configured minimum length.
func (h isNakedHex) isAcceptable(word string, _ bool) bool {
	if h.minLen == 0 || len(word) < h.minLen {
		return false
	}
	return isHex(word)
}
// isNumber is a heuristic that accepts any number written in Go syntax
// as a valid word.
type isNumber struct {
	// scanner is re-initialised on every call to isAcceptable.
	scanner scanner.Scanner
}

// isAcceptable abuses the go/scanner to check whether word is a number.
// The word is accepted when the first token scanned is an integer,
// floating point or imaginary literal that spans the whole word and the
// scanner reported no error.
func (h *isNumber) isAcceptable(word string, _ bool) bool {
	failed := false
	onError := func(token.Position, string) {
		failed = true
	}

	src := []byte(word)
	fset := token.NewFileSet()
	file := fset.AddFile("", fset.Base(), len(src))
	h.scanner.Init(file, src, onError, 0)

	_, tok, lit := h.scanner.Scan()
	if failed || lit != word {
		// Either the scanner complained or the literal is only a
		// prefix of the word.
		return false
	}
	switch tok {
	case token.INT, token.FLOAT, token.IMAG:
		return true
	default:
		return false
	}
}
// isHexRune is a heuristic that accepts Go rune literal escape syntax as
// a valid word.
type isHexRune struct{}

// isAcceptable returns whether word can be interpreted as a \x, \u, \U or
// \ooo octal rune escape. The accepted forms are a backslash followed by
//
//   - 'x' and exactly two hex digits,
//   - 'u' and exactly four hex digits,
//   - 'U' and exactly eight hex digits, or
//   - exactly three octal digits.
//
// Only the shape of the escape is checked; its numeric value is not
// validated.
func (isHexRune) isAcceptable(word string, _ bool) bool {
	// The shortest valid escape is four bytes long.
	if len(word) < 4 || word[0] != '\\' {
		return false
	}
	switch word[1] {
	case 'x':
		return len(word) == 4 && isHex(word[2:4])
	case 'u':
		return len(word) == 6 && isHex(word[2:6])
	case 'U':
		return len(word) == 10 && isHex(word[2:10])
	default:
		// An octal escape is a backslash and exactly three octal
		// digits, so any other length cannot match.
		if len(word) != 4 {
			return false
		}
		for _, c := range word[1:] {
			if c < '0' || '7' < c {
				return false
			}
		}
		return true
	}
}
// isUnit is a heuristic that accepts a quantity written together with
// its unit as a valid word.
type isUnit struct{}

// isAcceptable returns whether word is a number immediately followed by
// one of the known units. A unit on its own is not accepted here; naked
// units are handled by hunspell. When partial is true the word was
// directly adjacent to other characters, so it is never treated as a
// quantity with a unit.
func (isUnit) isAcceptable(word string, partial bool) bool {
	if partial {
		// Fragments of camel split words are not unit candidates.
		return false
	}
	for _, unit := range knownUnits {
		if !strings.HasSuffix(word, unit) {
			continue
		}
		// A unit may itself end in a shorter unit ("km" ends in
		// "m"), so a failed parse for one suffix does not end the
		// search; keep trying until some unit leaves a valid number.
		// This could be short-circuited if the list were ordered so
		// that no later unit is a suffix of an earlier one.
		quantity := word[:len(word)-len(unit)]
		if _, err := strconv.ParseFloat(quantity, 64); err == nil {
			return true
		}
	}
	return false
}

// knownUnits is the set of unit suffixes recognised by isUnit. Add more
// as they are identified as problems.
var knownUnits = []string{
	"k", "M", "x",
	"Kb", "kb", "Mb", "Gb", "Tb",
	"KB", "kB", "MB", "GB", "TB",
	"Kib", "kib", "Mib", "Gib", "Tib",
	"KiB", "kiB", "MiB", "GiB", "TiB",
	"Å", "nm", "µm", "mm", "cm", "m", "km",
	"ns", "µs", "us", "ms", "s", "min", "hr",
	"Hz",
	"am", "pm",
}
// patterns is a heuristic that accepts words matching user-provided
// regular expressions.
type patterns []*regexp.Regexp

// newPatterns compiles each of the provided expressions and returns them
// as a patterns heuristic, in the same order. If any expression fails to
// compile, a nil patterns and an error wrapping the compilation failure
// are returned.
func newPatterns(exprs []string) (patterns, error) {
	compiled := make(patterns, 0, len(exprs))
	for _, expr := range exprs {
		re, err := regexp.Compile(expr)
		if err != nil {
			return nil, fmt.Errorf("could not construct pattern heuristic: %w", err)
		}
		compiled = append(compiled, re)
	}
	return compiled, nil
}

// isAcceptable returns whether word is matched by at least one of the
// regular expressions in the heuristic. Partial words are never matched:
// when partial is true no expression is tried and false is returned. If
// matches against partial words are required, they should be encoded
// into the patterns themselves.
func (h patterns) isAcceptable(word string, partial bool) bool {
	if partial {
		return false
	}
	for i := range h {
		if h[i].MatchString(word) {
			return true
		}
	}
	return false
}
// isHex returns whether every character of s is an ASCII hex digit, in
// either case. The empty string contains no non-hex characters, so it
// is reported as hex; callers are expected to check the length
// themselves.
func isHex(s string) bool {
	for _, r := range s {
		// Each range is tested explicitly. Folding case by setting
		// the 0x20 bit would also map the control characters
		// 0x10-0x19 onto '0'-'9' and wrongly accept them.
		switch {
		case '0' <= r && r <= '9':
		case 'a' <= r && r <= 'f':
		case 'A' <= r && r <= 'F':
		default:
			return false
		}
	}
	return true
}