2025-10-15 10:12:44 +03:00

241 lines
5.6 KiB
Go

// Package stringutil exports common rune and string utilities for parsing and emitting JavaScript.
package stringutil
import (
"net/url"
"regexp"
"strings"
"unicode"
"unicode/utf8"
)
func IsWhiteSpaceLike(ch rune) bool {
return IsWhiteSpaceSingleLine(ch) || IsLineBreak(ch)
}
// IsWhiteSpaceSingleLine reports whether ch is a whitespace character that
// does not terminate a line.
//
// U+0085 (next line) is deliberately classified as whitespace here and not as
// a line break, because it is not in the exact line terminator set specified
// by ECMAScript.
func IsWhiteSpaceSingleLine(ch rune) bool {
	switch {
	case ch == ' ', ch == '\t', ch == '\v', ch == '\f':
		// space, tab, vertical tab, form feed
		return true
	case ch == 0x0085, ch == 0x00A0, ch == 0x1680:
		// next line, non-breaking space, ogham space mark
		return true
	case ch >= 0x2000 && ch <= 0x200B:
		// en quad through hair space (U+2000..U+200A), plus zero width space (U+200B)
		return true
	case ch == 0x202F, ch == 0x205F, ch == 0x3000, ch == 0xFEFF:
		// narrow no-break space, mathematical space, ideographic space, byte order mark
		return true
	default:
		return false
	}
}
// IsLineBreak reports whether ch is one of the ECMAScript line terminators
// (ES5 section 7.3, Table 3):
//
//	\u000A  Line Feed            <LF>
//	\u000D  Carriage Return      <CR>
//	\u2028  Line separator       <LS>
//	\u2029  Paragraph separator  <PS>
//
// Only these four characters count as line terminators. Any other new-line or
// line-breaking character is treated as white space, not as a line terminator.
func IsLineBreak(ch rune) bool {
	return ch == '\n' || ch == '\r' || ch == 0x2028 || ch == 0x2029
}
// IsDigit reports whether ch is an ASCII decimal digit ('0' through '9').
func IsDigit(ch rune) bool {
	return '0' <= ch && ch <= '9'
}
// IsOctalDigit reports whether ch is an ASCII octal digit ('0' through '7').
func IsOctalDigit(ch rune) bool {
	return '0' <= ch && ch <= '7'
}
// IsHexDigit reports whether ch is an ASCII hexadecimal digit: '0'-'9',
// 'a'-'f' or 'A'-'F'.
func IsHexDigit(ch rune) bool {
	switch {
	case '0' <= ch && ch <= '9':
		return true
	case 'A' <= ch && ch <= 'F':
		return true
	case 'a' <= ch && ch <= 'f':
		return true
	}
	return false
}
// IsASCIILetter reports whether ch is an ASCII letter ('A'-'Z' or 'a'-'z').
func IsASCIILetter(ch rune) bool {
	if 'a' <= ch && ch <= 'z' {
		return true
	}
	return 'A' <= ch && ch <= 'Z'
}
// SplitLines splits text into lines, treating "\r\n", a lone "\r" and a lone
// "\n" each as a single line separator. The separators are not included in
// the returned lines.
//
// Text after the final separator is returned as the last line only when it is
// non-empty, so "a\n" yields ["a"] and "" yields an empty (non-nil) slice.
func SplitLines(text string) []string {
	// Counting "\n" gives a cheap capacity estimate for the result.
	result := make([]string, 0, strings.Count(text, "\n")+1)
	lineStart := 0
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c != '\r' && c != '\n' {
			continue
		}
		result = append(result, text[lineStart:i])
		// A "\r\n" pair is one separator: step over the "\n" as well.
		if c == '\r' && i+1 < len(text) && text[i+1] == '\n' {
			i++
		}
		lineStart = i + 1
	}
	if lineStart < len(text) {
		result = append(result, text[lineStart:])
	}
	return result
}
// GuessIndentation returns the smallest number of leading whitespace bytes
// (as classified by IsWhiteSpaceLike) found on any non-empty line in lines.
// Empty lines are ignored. It returns 0 when lines contains no non-empty line.
func GuessIndentation(lines []string) int {
	// Sentinel meaning "no line has been measured yet".
	const unset int = 0x3fff_ffff
	smallest := unset
	for _, line := range lines {
		if line == "" {
			continue
		}
		// Measure the leading whitespace, giving up once the current minimum
		// is reached since a longer run cannot lower the result.
		width := 0
		for width < len(line) && width < smallest {
			r, n := utf8.DecodeRuneInString(line[width:])
			if !IsWhiteSpaceLike(r) {
				break
			}
			width += n
		}
		if width < smallest {
			smallest = width
		}
		if smallest == 0 {
			// Nothing can be smaller; stop scanning.
			return 0
		}
	}
	if smallest == unset {
		return 0
	}
	return smallest
}
// uriReservedChars lists the characters that EncodeURI copies to the output
// verbatim as URI delimiters.
const uriReservedChars = ";/?:@&=+$,#"

// uriEscapeFixups rewrites url.QueryEscape output so that it matches the
// ECMAScript encodeURI rules: a space must become "%20" rather than "+", and
// the marks ! ' ( ) * must stay unescaped. It is only applied to segments
// that contain no literal '+', so every '+' it sees stands for a space.
var uriEscapeFixups = strings.NewReplacer(
	"+", "%20",
	"%21", "!",
	"%27", "'",
	"%28", "(",
	"%29", ")",
	"%2A", "*",
)

// escapeURISegment percent-encodes a segment that contains none of the
// characters in uriReservedChars.
func escapeURISegment(segment string) string {
	return uriEscapeFixups.Replace(url.QueryEscape(segment))
}

// EncodeURI percent-encodes s following the ECMAScript encodeURI algorithm:
// ASCII letters, digits, the reserved characters ; / ? : @ & = + $ , # and
// the marks - _ . ! ~ * ' ( ) are left as is; every other byte is written as
// an uppercase %XX escape.
//
// Input is escaped byte by byte, so input that is not valid UTF-8 does not
// produce an error.
//
// https://tc39.es/ecma262/multipage/global-object.html#sec-encodeuri-uri
func EncodeURI(s string) string {
	var builder strings.Builder
	start := 0
	for pos := indexAny(s, uriReservedChars, 0); pos >= 0; pos = indexAny(s, uriReservedChars, start) {
		// Escape the run before the reserved character, then copy the
		// reserved character itself through untouched.
		builder.WriteString(escapeURISegment(s[start:pos]))
		builder.WriteByte(s[pos])
		start = pos + 1
	}
	if start < len(s) {
		builder.WriteString(escapeURISegment(s[start:]))
	}
	return builder.String()
}

// indexAny returns the byte index of the first occurrence in s, at or after
// start, of any character from chars. It returns -1 when there is no such
// occurrence or when start lies outside s.
func indexAny(s, chars string, start int) int {
	if start < 0 || start >= len(s) {
		return -1
	}
	index := strings.IndexAny(s[start:], chars)
	if index < 0 {
		return -1
	}
	return start + index
}
// getByteOrderMarkLength returns the number of leading bytes of text that
// form a byte order mark: 2 for the byte pairs FE FF or FF FE, 3 for the
// UTF-8 sequence EF BB BF, and 0 when text starts with none of them.
func getByteOrderMarkLength(text string) int {
	switch {
	case len(text) >= 2 && text[0] == 0xfe && text[1] == 0xff:
		return 2 // utf16be
	case len(text) >= 2 && text[0] == 0xff && text[1] == 0xfe:
		return 2 // utf16le
	case len(text) >= 3 && text[0] == 0xef && text[1] == 0xbb && text[2] == 0xbf:
		return 3 // utf8
	}
	return 0
}
// RemoveByteOrderMark returns text with any leading byte order mark
// recognised by getByteOrderMarkLength removed. Text without such a mark is
// returned unchanged.
func RemoveByteOrderMark(text string) string {
	// A length of zero slices off nothing, leaving text as it was.
	return text[getByteOrderMarkLength(text):]
}
// AddUTF8ByteOrderMark prepends the UTF-8 byte order mark (EF BB BF) to text.
// Text that already starts with any byte order mark recognised by
// getByteOrderMarkLength is returned unchanged.
func AddUTF8ByteOrderMark(text string) string {
	if getByteOrderMarkLength(text) != 0 {
		return text
	}
	return "\xEF\xBB\xBF" + text
}
// StripQuotes removes one pair of matching surrounding quotes from name. The
// recognised quote characters are the single quote, the double quote and the
// backtick. The first and last characters must be the same quote character.
//
// name is returned unchanged when it is shorter than two bytes or is not
// wrapped in a matching pair of quotes.
func StripQuotes(name string) string {
	// A lone quote character is both the first and the last rune; without the
	// length guard it would be sliced as name[1:0] and panic.
	if len(name) < 2 {
		return name
	}
	firstChar, _ := utf8.DecodeRuneInString(name)
	lastChar, _ := utf8.DecodeLastRuneInString(name)
	if firstChar == lastChar && (firstChar == '\'' || firstChar == '"' || firstChar == '`') {
		// Each quote character is a single byte, so trim one byte per side.
		return name[1 : len(name)-1]
	}
	return name
}
// matchSlashSomething matches a backslash followed by any single character
// other than a newline, like the JavaScript pattern /\\./g. Inside a raw
// string the backslash must be doubled: a single `\.` would match a literal
// period instead.
var matchSlashSomething = regexp.MustCompile(`\\.`)

// matchSlashReplacer receives one matchSlashSomething match (a backslash plus
// one character) and returns it without the leading backslash.
func matchSlashReplacer(in string) string {
	return in[1:]
}
// UnquoteString strips the surrounding quotes from str with StripQuotes and
// then rewrites every matchSlashSomething match using matchSlashReplacer.
//
// strconv.Unquote is not used because it accepts only a single character
// inside single quotes (those are character literals in Go).
//
// The substitution mirrors the original TypeScript implementation, which does
// str.replace(/\\./g, s => s.substring(1)), i.e. it replaces each
// backslash-something with just the something. That is reproduced faithfully
// even though it is not a real unescape; this should probably be an actual
// unquote operation.
func UnquoteString(str string) string {
	unquoted := StripQuotes(str)
	return matchSlashSomething.ReplaceAllStringFunc(unquoted, matchSlashReplacer)
}
// LowerFirstChar returns str with its first rune converted to lower case via
// unicode.ToLower. An empty string is returned unchanged.
func LowerFirstChar(str string) string {
	first, width := utf8.DecodeRuneInString(str)
	if width == 0 {
		// Only the empty string decodes to a zero-width rune.
		return str
	}
	return string(unicode.ToLower(first)) + str[width:]
}