Updated external dependencies.

This commit is contained in:
Martin Dosch 2019-05-31 09:10:48 +02:00
parent d025df0507
commit 86a5c5f610
28 changed files with 416 additions and 4428 deletions

View file

@ -6,9 +6,9 @@ import (
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/mmcdole/gofeed/extensions"
ext "github.com/mmcdole/gofeed/extensions"
"github.com/mmcdole/gofeed/internal/shared"
"github.com/mmcdole/goxpp"
xpp "github.com/mmcdole/goxpp"
)
var (
@ -681,10 +681,8 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
lowerType := strings.ToLower(text.Type)
lowerMode := strings.ToLower(text.Mode)
if strings.HasPrefix(result, "<![CDATA[") &&
strings.HasSuffix(result, "]]>") {
result = strings.TrimPrefix(result, "<![CDATA[")
result = strings.TrimSuffix(result, "]]>")
if strings.Contains(result, "<![CDATA[") {
result = shared.StripCDATA(result)
if lowerType == "html" || strings.Contains(lowerType, "xhtml") {
result, _ = ap.base.ResolveHTML(result)
}

View file

@ -5,7 +5,7 @@ import (
"strings"
"github.com/mmcdole/gofeed/internal/shared"
"github.com/mmcdole/goxpp"
xpp "github.com/mmcdole/goxpp"
)
// FeedType represents one of the possible feed

View file

@ -8,7 +8,7 @@ import (
"strconv"
"strings"
"github.com/mmcdole/goxpp"
xpp "github.com/mmcdole/goxpp"
)
var (
@ -21,6 +21,9 @@ var (
InvalidNumericReference = errors.New("invalid numeric reference")
)
const CDATA_START = "<![CDATA["
const CDATA_END = "]]>"
// ParseText is a helper function for parsing the text
// from the current element of the XMLPullParser.
// This function can handle parsing naked XML text from
@ -39,16 +42,46 @@ func ParseText(p *xpp.XMLPullParser) (string, error) {
result := text.InnerXML
result = strings.TrimSpace(result)
if strings.HasPrefix(result, "<![CDATA[") &&
strings.HasSuffix(result, "]]>") {
result = strings.TrimPrefix(result, "<![CDATA[")
result = strings.TrimSuffix(result, "]]>")
return result, nil
if strings.Contains(result, CDATA_START) {
return StripCDATA(result), nil
}
return DecodeEntities(result)
}
// StripCDATA removes CDATA tags from the string
// content outside of CDATA tags is passed via DecodeEntities
func StripCDATA(str string) string {
buf := bytes.NewBuffer([]byte{})
curr := 0
for curr < len(str) {
start := indexAt(str, CDATA_START, curr)
if start == -1 {
dec, _ := DecodeEntities(str[curr:])
buf.Write([]byte(dec))
return buf.String()
}
end := indexAt(str, CDATA_END, start)
if end == -1 {
dec, _ := DecodeEntities(str[curr:])
buf.Write([]byte(dec))
return buf.String()
}
buf.Write([]byte(str[start+len(CDATA_START) : end]))
curr = curr + end + len(CDATA_END)
}
return buf.String()
}
// DecodeEntities decodes escaped XML entities
// in a string and returns the unescaped string
func DecodeEntities(str string) (string, error) {
@ -151,3 +184,11 @@ func ParseNameAddress(nameAddressText string) (name string, address string) {
}
return
}
func indexAt(str, substr string, start int) int {
idx := strings.Index(str[start:], substr)
if idx > -1 {
idx += start
}
return idx
}

View file

@ -5,9 +5,9 @@ import (
"io"
"strings"
"github.com/mmcdole/gofeed/extensions"
ext "github.com/mmcdole/gofeed/extensions"
"github.com/mmcdole/gofeed/internal/shared"
"github.com/mmcdole/goxpp"
xpp "github.com/mmcdole/goxpp"
)
// Parser is a RSS Parser

View file

@ -6,7 +6,7 @@ import (
"time"
"github.com/mmcdole/gofeed/atom"
"github.com/mmcdole/gofeed/extensions"
ext "github.com/mmcdole/gofeed/extensions"
"github.com/mmcdole/gofeed/internal/shared"
"github.com/mmcdole/gofeed/rss"
)

View file

@ -78,14 +78,14 @@ table.Render()
```
DATE | DESCRIPTION | CV2 | AMOUNT
+----------+--------------------------+-------+---------+
-----------+--------------------------+-------+----------
1/1/2014 | Domain name | 2233 | $10.98
1/1/2014 | January Hosting | 2233 | $54.95
1/4/2014 | February Hosting | 2233 | $51.00
1/4/2014 | February Extra Bandwidth | 2233 | $30.00
+----------+--------------------------+-------+---------+
-----------+--------------------------+-------+----------
TOTAL | $146 93
+-------+---------+
--------+----------
```
@ -267,6 +267,34 @@ Note: Caption text will wrap with total width of rendered table.
Movie ratings.
```
#### Render table into a string
Instead of rendering the table to `io.Stdout` you can also render it into a string. Go 1.10 introduced the `strings.Builder` type which implements the `io.Writer` interface and can therefore be used for this task. Example:
```go
package main
import (
"strings"
"fmt"
"github.com/olekukonko/tablewriter"
)
func main() {
tableString := &strings.Builder{}
table := tablewriter.NewWriter(tableString)
/*
* Code to fill the table
*/
table.Render()
fmt.Println(tableString.String())
}
```
#### TODO
- ~~Import Directly from CSV~~ - `done`
- ~~Support for `SetFooter`~~ - `done`

View file

@ -319,16 +319,29 @@ func (t *Table) ClearFooter() {
t.footers = [][]string{}
}
// Center based on position and border.
func (t *Table) center(i int) string {
if i == -1 && !t.borders.Left {
return t.pRow
}
if i == len(t.cs)-1 && !t.borders.Right {
return t.pRow
}
return t.pCenter
}
// Print line based on row width
func (t *Table) printLine(nl bool) {
fmt.Fprint(t.out, t.pCenter)
fmt.Fprint(t.out, t.center(-1))
for i := 0; i < len(t.cs); i++ {
v := t.cs[i]
fmt.Fprintf(t.out, "%s%s%s%s",
t.pRow,
strings.Repeat(string(t.pRow), v),
t.pRow,
t.pCenter)
t.center(i))
}
if nl {
fmt.Fprint(t.out, t.newLine)
@ -517,6 +530,9 @@ func (t *Table) printFooter() {
// Print first junction
if i == 0 {
if length > 0 && !t.borders.Left {
center = t.pRow
}
fmt.Fprint(t.out, center)
}
@ -524,16 +540,27 @@ func (t *Table) printFooter() {
if length == 0 {
pad = SPACE
}
// Ignore left space of it has printed before
// Ignore left space as it has printed before
if hasPrinted || t.borders.Left {
pad = t.pRow
center = t.pCenter
}
// Change Center end position
if center != SPACE {
if i == end && !t.borders.Right {
center = t.pRow
}
}
// Change Center start position
if center == SPACE {
if i < end && len(t.footers[i+1][0]) != 0 {
center = t.pCenter
if !t.borders.Left {
center = t.pRow
} else {
center = t.pCenter
}
}
}

View file

@ -61,7 +61,7 @@ func Title(name string) string {
}
// Pad String
// Attempts to play string in the center
// Attempts to place string in the center
func Pad(s, pad string, width int) string {
gap := width - DisplayWidth(s)
if gap > 0 {
@ -73,7 +73,7 @@ func Pad(s, pad string, width int) string {
}
// Pad String Right position
// This would pace string at the left side fo the screen
// This would place string at the left side of the screen
func PadRight(s, pad string, width int) string {
gap := width - DisplayWidth(s)
if gap > 0 {
@ -83,7 +83,7 @@ func PadRight(s, pad string, width int) string {
}
// Pad String Left position
// This would pace string at the right side fo the screen
// This would place string at the right side of the screen
func PadLeft(s, pad string, width int) string {
gap := width - DisplayWidth(s)
if gap > 0 {

View file

@ -1,712 +0,0 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
//go:generate go run gen.go
//go:generate go run gen.go -test
package main
import (
"bytes"
"flag"
"fmt"
"go/format"
"io/ioutil"
"math/rand"
"os"
"sort"
"strings"
)
// identifier converts s to a Go exported identifier.
// It converts "div" to "Div" and "accept-charset" to "AcceptCharset".
func identifier(s string) string {
b := make([]byte, 0, len(s))
cap := true
for _, c := range s {
if c == '-' {
cap = true
continue
}
if cap && 'a' <= c && c <= 'z' {
c -= 'a' - 'A'
}
cap = false
b = append(b, byte(c))
}
return string(b)
}
var test = flag.Bool("test", false, "generate table_test.go")
func genFile(name string, buf *bytes.Buffer) {
b, err := format.Source(buf.Bytes())
if err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
if err := ioutil.WriteFile(name, b, 0644); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}
func main() {
flag.Parse()
var all []string
all = append(all, elements...)
all = append(all, attributes...)
all = append(all, eventHandlers...)
all = append(all, extra...)
sort.Strings(all)
// uniq - lists have dups
w := 0
for _, s := range all {
if w == 0 || all[w-1] != s {
all[w] = s
w++
}
}
all = all[:w]
if *test {
var buf bytes.Buffer
fmt.Fprintln(&buf, "// Code generated by go generate gen.go; DO NOT EDIT.\n")
fmt.Fprintln(&buf, "//go:generate go run gen.go -test\n")
fmt.Fprintln(&buf, "package atom\n")
fmt.Fprintln(&buf, "var testAtomList = []string{")
for _, s := range all {
fmt.Fprintf(&buf, "\t%q,\n", s)
}
fmt.Fprintln(&buf, "}")
genFile("table_test.go", &buf)
return
}
// Find hash that minimizes table size.
var best *table
for i := 0; i < 1000000; i++ {
if best != nil && 1<<(best.k-1) < len(all) {
break
}
h := rand.Uint32()
for k := uint(0); k <= 16; k++ {
if best != nil && k >= best.k {
break
}
var t table
if t.init(h, k, all) {
best = &t
break
}
}
}
if best == nil {
fmt.Fprintf(os.Stderr, "failed to construct string table\n")
os.Exit(1)
}
// Lay out strings, using overlaps when possible.
layout := append([]string{}, all...)
// Remove strings that are substrings of other strings
for changed := true; changed; {
changed = false
for i, s := range layout {
if s == "" {
continue
}
for j, t := range layout {
if i != j && t != "" && strings.Contains(s, t) {
changed = true
layout[j] = ""
}
}
}
}
// Join strings where one suffix matches another prefix.
for {
// Find best i, j, k such that layout[i][len-k:] == layout[j][:k],
// maximizing overlap length k.
besti := -1
bestj := -1
bestk := 0
for i, s := range layout {
if s == "" {
continue
}
for j, t := range layout {
if i == j {
continue
}
for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
if s[len(s)-k:] == t[:k] {
besti = i
bestj = j
bestk = k
}
}
}
}
if bestk > 0 {
layout[besti] += layout[bestj][bestk:]
layout[bestj] = ""
continue
}
break
}
text := strings.Join(layout, "")
atom := map[string]uint32{}
for _, s := range all {
off := strings.Index(text, s)
if off < 0 {
panic("lost string " + s)
}
atom[s] = uint32(off<<8 | len(s))
}
var buf bytes.Buffer
// Generate the Go code.
fmt.Fprintln(&buf, "// Code generated by go generate gen.go; DO NOT EDIT.\n")
fmt.Fprintln(&buf, "//go:generate go run gen.go\n")
fmt.Fprintln(&buf, "package atom\n\nconst (")
// compute max len
maxLen := 0
for _, s := range all {
if maxLen < len(s) {
maxLen = len(s)
}
fmt.Fprintf(&buf, "\t%s Atom = %#x\n", identifier(s), atom[s])
}
fmt.Fprintln(&buf, ")\n")
fmt.Fprintf(&buf, "const hash0 = %#x\n\n", best.h0)
fmt.Fprintf(&buf, "const maxAtomLen = %d\n\n", maxLen)
fmt.Fprintf(&buf, "var table = [1<<%d]Atom{\n", best.k)
for i, s := range best.tab {
if s == "" {
continue
}
fmt.Fprintf(&buf, "\t%#x: %#x, // %s\n", i, atom[s], s)
}
fmt.Fprintf(&buf, "}\n")
datasize := (1 << best.k) * 4
fmt.Fprintln(&buf, "const atomText =")
textsize := len(text)
for len(text) > 60 {
fmt.Fprintf(&buf, "\t%q +\n", text[:60])
text = text[60:]
}
fmt.Fprintf(&buf, "\t%q\n\n", text)
genFile("table.go", &buf)
fmt.Fprintf(os.Stdout, "%d atoms; %d string bytes + %d tables = %d total data\n", len(all), textsize, datasize, textsize+datasize)
}
type byLen []string
func (x byLen) Less(i, j int) bool { return len(x[i]) > len(x[j]) }
func (x byLen) Swap(i, j int) { x[i], x[j] = x[j], x[i] }
func (x byLen) Len() int { return len(x) }
// fnv computes the FNV hash with an arbitrary starting value h.
func fnv(h uint32, s string) uint32 {
for i := 0; i < len(s); i++ {
h ^= uint32(s[i])
h *= 16777619
}
return h
}
// A table represents an attempt at constructing the lookup table.
// The lookup table uses cuckoo hashing, meaning that each string
// can be found in one of two positions.
type table struct {
h0 uint32
k uint
mask uint32
tab []string
}
// hash returns the two hashes for s.
func (t *table) hash(s string) (h1, h2 uint32) {
h := fnv(t.h0, s)
h1 = h & t.mask
h2 = (h >> 16) & t.mask
return
}
// init initializes the table with the given parameters.
// h0 is the initial hash value,
// k is the number of bits of hash value to use, and
// x is the list of strings to store in the table.
// init returns false if the table cannot be constructed.
func (t *table) init(h0 uint32, k uint, x []string) bool {
t.h0 = h0
t.k = k
t.tab = make([]string, 1<<k)
t.mask = 1<<k - 1
for _, s := range x {
if !t.insert(s) {
return false
}
}
return true
}
// insert inserts s in the table.
func (t *table) insert(s string) bool {
h1, h2 := t.hash(s)
if t.tab[h1] == "" {
t.tab[h1] = s
return true
}
if t.tab[h2] == "" {
t.tab[h2] = s
return true
}
if t.push(h1, 0) {
t.tab[h1] = s
return true
}
if t.push(h2, 0) {
t.tab[h2] = s
return true
}
return false
}
// push attempts to push aside the entry in slot i.
func (t *table) push(i uint32, depth int) bool {
if depth > len(t.tab) {
return false
}
s := t.tab[i]
h1, h2 := t.hash(s)
j := h1 + h2 - i
if t.tab[j] != "" && !t.push(j, depth+1) {
return false
}
t.tab[j] = s
return true
}
// The lists of element names and attribute keys were taken from
// https://html.spec.whatwg.org/multipage/indices.html#index
// as of the "HTML Living Standard - Last Updated 16 April 2018" version.
// "command", "keygen" and "menuitem" have been removed from the spec,
// but are kept here for backwards compatibility.
var elements = []string{
"a",
"abbr",
"address",
"area",
"article",
"aside",
"audio",
"b",
"base",
"bdi",
"bdo",
"blockquote",
"body",
"br",
"button",
"canvas",
"caption",
"cite",
"code",
"col",
"colgroup",
"command",
"data",
"datalist",
"dd",
"del",
"details",
"dfn",
"dialog",
"div",
"dl",
"dt",
"em",
"embed",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"header",
"hgroup",
"hr",
"html",
"i",
"iframe",
"img",
"input",
"ins",
"kbd",
"keygen",
"label",
"legend",
"li",
"link",
"main",
"map",
"mark",
"menu",
"menuitem",
"meta",
"meter",
"nav",
"noscript",
"object",
"ol",
"optgroup",
"option",
"output",
"p",
"param",
"picture",
"pre",
"progress",
"q",
"rp",
"rt",
"ruby",
"s",
"samp",
"script",
"section",
"select",
"slot",
"small",
"source",
"span",
"strong",
"style",
"sub",
"summary",
"sup",
"table",
"tbody",
"td",
"template",
"textarea",
"tfoot",
"th",
"thead",
"time",
"title",
"tr",
"track",
"u",
"ul",
"var",
"video",
"wbr",
}
// https://html.spec.whatwg.org/multipage/indices.html#attributes-3
//
// "challenge", "command", "contextmenu", "dropzone", "icon", "keytype", "mediagroup",
// "radiogroup", "spellcheck", "scoped", "seamless", "sortable" and "sorted" have been removed from the spec,
// but are kept here for backwards compatibility.
var attributes = []string{
"abbr",
"accept",
"accept-charset",
"accesskey",
"action",
"allowfullscreen",
"allowpaymentrequest",
"allowusermedia",
"alt",
"as",
"async",
"autocomplete",
"autofocus",
"autoplay",
"challenge",
"charset",
"checked",
"cite",
"class",
"color",
"cols",
"colspan",
"command",
"content",
"contenteditable",
"contextmenu",
"controls",
"coords",
"crossorigin",
"data",
"datetime",
"default",
"defer",
"dir",
"dirname",
"disabled",
"download",
"draggable",
"dropzone",
"enctype",
"for",
"form",
"formaction",
"formenctype",
"formmethod",
"formnovalidate",
"formtarget",
"headers",
"height",
"hidden",
"high",
"href",
"hreflang",
"http-equiv",
"icon",
"id",
"inputmode",
"integrity",
"is",
"ismap",
"itemid",
"itemprop",
"itemref",
"itemscope",
"itemtype",
"keytype",
"kind",
"label",
"lang",
"list",
"loop",
"low",
"manifest",
"max",
"maxlength",
"media",
"mediagroup",
"method",
"min",
"minlength",
"multiple",
"muted",
"name",
"nomodule",
"nonce",
"novalidate",
"open",
"optimum",
"pattern",
"ping",
"placeholder",
"playsinline",
"poster",
"preload",
"radiogroup",
"readonly",
"referrerpolicy",
"rel",
"required",
"reversed",
"rows",
"rowspan",
"sandbox",
"spellcheck",
"scope",
"scoped",
"seamless",
"selected",
"shape",
"size",
"sizes",
"sortable",
"sorted",
"slot",
"span",
"spellcheck",
"src",
"srcdoc",
"srclang",
"srcset",
"start",
"step",
"style",
"tabindex",
"target",
"title",
"translate",
"type",
"typemustmatch",
"updateviacache",
"usemap",
"value",
"width",
"workertype",
"wrap",
}
// "onautocomplete", "onautocompleteerror", "onmousewheel",
// "onshow" and "onsort" have been removed from the spec,
// but are kept here for backwards compatibility.
var eventHandlers = []string{
"onabort",
"onautocomplete",
"onautocompleteerror",
"onauxclick",
"onafterprint",
"onbeforeprint",
"onbeforeunload",
"onblur",
"oncancel",
"oncanplay",
"oncanplaythrough",
"onchange",
"onclick",
"onclose",
"oncontextmenu",
"oncopy",
"oncuechange",
"oncut",
"ondblclick",
"ondrag",
"ondragend",
"ondragenter",
"ondragexit",
"ondragleave",
"ondragover",
"ondragstart",
"ondrop",
"ondurationchange",
"onemptied",
"onended",
"onerror",
"onfocus",
"onhashchange",
"oninput",
"oninvalid",
"onkeydown",
"onkeypress",
"onkeyup",
"onlanguagechange",
"onload",
"onloadeddata",
"onloadedmetadata",
"onloadend",
"onloadstart",
"onmessage",
"onmessageerror",
"onmousedown",
"onmouseenter",
"onmouseleave",
"onmousemove",
"onmouseout",
"onmouseover",
"onmouseup",
"onmousewheel",
"onwheel",
"onoffline",
"ononline",
"onpagehide",
"onpageshow",
"onpaste",
"onpause",
"onplay",
"onplaying",
"onpopstate",
"onprogress",
"onratechange",
"onreset",
"onresize",
"onrejectionhandled",
"onscroll",
"onsecuritypolicyviolation",
"onseeked",
"onseeking",
"onselect",
"onshow",
"onsort",
"onstalled",
"onstorage",
"onsubmit",
"onsuspend",
"ontimeupdate",
"ontoggle",
"onunhandledrejection",
"onunload",
"onvolumechange",
"onwaiting",
}
// extra are ad-hoc values not covered by any of the lists above.
var extra = []string{
"acronym",
"align",
"annotation",
"annotation-xml",
"applet",
"basefont",
"bgsound",
"big",
"blink",
"center",
"color",
"desc",
"face",
"font",
"foreignObject", // HTML is case-insensitive, but SVG-embedded-in-HTML is case-sensitive.
"foreignobject",
"frame",
"frameset",
"image",
"isindex",
"listing",
"malignmark",
"marquee",
"math",
"mglyph",
"mi",
"mn",
"mo",
"ms",
"mtext",
"nobr",
"noembed",
"noframes",
"plaintext",
"prompt",
"public",
"rb",
"rtc",
"spacer",
"strike",
"svg",
"system",
"tt",
"xmp",
}

127
vendor/golang.org/x/net/html/parse.go generated vendored
View file

@ -630,7 +630,16 @@ func inHeadIM(p *parser) bool {
p.oe.pop()
p.acknowledgeSelfClosingTag()
return true
case a.Script, a.Title, a.Noscript, a.Noframes, a.Style:
case a.Noscript:
p.addElement()
if p.scripting {
p.setOriginalIM()
p.im = textIM
} else {
p.im = inHeadNoscriptIM
}
return true
case a.Script, a.Title, a.Noframes, a.Style:
p.addElement()
p.setOriginalIM()
p.im = textIM
@ -692,6 +701,49 @@ func inHeadIM(p *parser) bool {
return false
}
// 12.2.6.4.5.
func inHeadNoscriptIM(p *parser) bool {
switch p.tok.Type {
case DoctypeToken:
// Ignore the token.
return true
case StartTagToken:
switch p.tok.DataAtom {
case a.Html:
return inBodyIM(p)
case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style:
return inHeadIM(p)
case a.Head, a.Noscript:
// Ignore the token.
return true
}
case EndTagToken:
switch p.tok.DataAtom {
case a.Noscript, a.Br:
default:
// Ignore the token.
return true
}
case TextToken:
s := strings.TrimLeft(p.tok.Data, whitespace)
if len(s) == 0 {
// It was all whitespace.
return inHeadIM(p)
}
case CommentToken:
return inHeadIM(p)
}
p.oe.pop()
if p.top().DataAtom != a.Head {
panic("html: the new current node will be a head element.")
}
p.im = inHeadIM
if p.tok.DataAtom == a.Noscript {
return true
}
return false
}
// Section 12.2.6.4.6.
func afterHeadIM(p *parser) bool {
switch p.tok.Type {
@ -901,7 +953,7 @@ func inBodyIM(p *parser) bool {
case a.A:
for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
p.inBodyEndTagFormatting(a.A)
p.inBodyEndTagFormatting(a.A, "a")
p.oe.remove(n)
p.afe.remove(n)
break
@ -915,7 +967,7 @@ func inBodyIM(p *parser) bool {
case a.Nobr:
p.reconstructActiveFormattingElements()
if p.elementInScope(defaultScope, a.Nobr) {
p.inBodyEndTagFormatting(a.Nobr)
p.inBodyEndTagFormatting(a.Nobr, "nobr")
p.reconstructActiveFormattingElements()
}
p.addFormattingElement()
@ -1123,7 +1175,7 @@ func inBodyIM(p *parser) bool {
case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
p.inBodyEndTagFormatting(p.tok.DataAtom)
p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
case a.Applet, a.Marquee, a.Object:
if p.popUntil(defaultScope, p.tok.DataAtom) {
p.clearActiveFormattingElements()
@ -1134,7 +1186,7 @@ func inBodyIM(p *parser) bool {
case a.Template:
return inHeadIM(p)
default:
p.inBodyEndTagOther(p.tok.DataAtom)
p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
}
case CommentToken:
p.addChild(&Node{
@ -1161,7 +1213,7 @@ func inBodyIM(p *parser) bool {
return true
}
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
// This is the "adoption agency" algorithm, described at
// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency
@ -1183,7 +1235,7 @@ func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
}
}
if formattingElement == nil {
p.inBodyEndTagOther(tagAtom)
p.inBodyEndTagOther(tagAtom, tagName)
return
}
feIndex := p.oe.index(formattingElement)
@ -1288,9 +1340,17 @@ func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
func (p *parser) inBodyEndTagOther(tagAtom a.Atom) {
func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
for i := len(p.oe) - 1; i >= 0; i-- {
if p.oe[i].DataAtom == tagAtom {
// Two element nodes have the same tag if they have the same Data (a
// string-typed field). As an optimization, for common HTML tags, each
// Data string is assigned a unique, non-zero DataAtom (a uint32-typed
// field), since integer comparison is faster than string comparison.
// Uncommon (custom) tags get a zero DataAtom.
//
// The if condition here is equivalent to (p.oe[i].Data == tagName).
if (p.oe[i].DataAtom == tagAtom) &&
((tagAtom != 0) || (p.oe[i].Data == tagName)) {
p.oe = p.oe[:i]
break
}
@ -1684,8 +1744,9 @@ func inCellIM(p *parser) bool {
return true
}
// Close the cell and reprocess.
p.popUntil(tableScope, a.Td, a.Th)
p.clearActiveFormattingElements()
if p.popUntil(tableScope, a.Td, a.Th) {
p.clearActiveFormattingElements()
}
p.im = inRowIM
return false
}
@ -2239,6 +2300,33 @@ func (p *parser) parse() error {
//
// The input is assumed to be UTF-8 encoded.
func Parse(r io.Reader) (*Node, error) {
return ParseWithOptions(r)
}
// ParseFragment parses a fragment of HTML and returns the nodes that were
// found. If the fragment is the InnerHTML for an existing element, pass that
// element in context.
//
// It has the same intricacies as Parse.
func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
return ParseFragmentWithOptions(r, context)
}
// ParseOption configures a parser.
type ParseOption func(p *parser)
// ParseOptionEnableScripting configures the scripting flag.
// https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting
//
// By default, scripting is enabled.
func ParseOptionEnableScripting(enable bool) ParseOption {
return func(p *parser) {
p.scripting = enable
}
}
// ParseWithOptions is like Parse, with options.
func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) {
p := &parser{
tokenizer: NewTokenizer(r),
doc: &Node{
@ -2248,6 +2336,11 @@ func Parse(r io.Reader) (*Node, error) {
framesetOK: true,
im: initialIM,
}
for _, f := range opts {
f(p)
}
err := p.parse()
if err != nil {
return nil, err
@ -2255,12 +2348,8 @@ func Parse(r io.Reader) (*Node, error) {
return p.doc, nil
}
// ParseFragment parses a fragment of HTML and returns the nodes that were
// found. If the fragment is the InnerHTML for an existing element, pass that
// element in context.
//
// It has the same intricacies as Parse.
func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
// ParseFragmentWithOptions is like ParseFragment, with options.
func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) {
contextTag := ""
if context != nil {
if context.Type != ElementNode {
@ -2284,6 +2373,10 @@ func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
context: context,
}
for _, f := range opts {
f(p)
}
root := &Node{
Type: ElementNode,
DataAtom: a.Html,

View file

@ -1,556 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/internal/gen"
)
const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
` !"#$%&'()*+,-./0123456789:;<=>?` +
`@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` +
"`abcdefghijklmnopqrstuvwxyz{|}~\u007f"
var encodings = []struct {
name string
mib string
comment string
varName string
replacement byte
mapping string
}{
{
"IBM Code Page 037",
"IBM037",
"",
"CodePage037",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm",
},
{
"IBM Code Page 437",
"PC8CodePage437",
"",
"CodePage437",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
},
{
"IBM Code Page 850",
"PC850Multilingual",
"",
"CodePage850",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
},
{
"IBM Code Page 852",
"PCp852",
"",
"CodePage852",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
},
{
"IBM Code Page 855",
"IBM855",
"",
"CodePage855",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
},
{
"Windows Code Page 858", // PC latin1 with Euro
"IBM00858",
"",
"CodePage858",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
},
{
"IBM Code Page 860",
"IBM860",
"",
"CodePage860",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm",
},
{
"IBM Code Page 862",
"PC862LatinHebrew",
"",
"CodePage862",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
},
{
"IBM Code Page 863",
"IBM863",
"",
"CodePage863",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm",
},
{
"IBM Code Page 865",
"IBM865",
"",
"CodePage865",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm",
},
{
"IBM Code Page 866",
"IBM866",
"",
"CodePage866",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-ibm866.txt",
},
{
"IBM Code Page 1047",
"IBM1047",
"",
"CodePage1047",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm",
},
{
"IBM Code Page 1140",
"IBM01140",
"",
"CodePage1140",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm",
},
{
"ISO 8859-1",
"ISOLatin1",
"",
"ISO8859_1",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm",
},
{
"ISO 8859-2",
"ISOLatin2",
"",
"ISO8859_2",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
},
{
"ISO 8859-3",
"ISOLatin3",
"",
"ISO8859_3",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
},
{
"ISO 8859-4",
"ISOLatin4",
"",
"ISO8859_4",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
},
{
"ISO 8859-5",
"ISOLatinCyrillic",
"",
"ISO8859_5",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
},
{
"ISO 8859-6",
"ISOLatinArabic",
"",
"ISO8859_6,ISO8859_6E,ISO8859_6I",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
},
{
"ISO 8859-7",
"ISOLatinGreek",
"",
"ISO8859_7",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
},
{
"ISO 8859-8",
"ISOLatinHebrew",
"",
"ISO8859_8,ISO8859_8E,ISO8859_8I",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
},
{
"ISO 8859-9",
"ISOLatin5",
"",
"ISO8859_9",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm",
},
{
"ISO 8859-10",
"ISOLatin6",
"",
"ISO8859_10",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
},
{
"ISO 8859-13",
"ISO885913",
"",
"ISO8859_13",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
},
{
"ISO 8859-14",
"ISO885914",
"",
"ISO8859_14",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
},
{
"ISO 8859-15",
"ISO885915",
"",
"ISO8859_15",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
},
{
"ISO 8859-16",
"ISO885916",
"",
"ISO8859_16",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
},
{
"KOI8-R",
"KOI8R",
"",
"KOI8R",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-koi8-r.txt",
},
{
"KOI8-U",
"KOI8U",
"",
"KOI8U",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-koi8-u.txt",
},
{
"Macintosh",
"Macintosh",
"",
"Macintosh",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-macintosh.txt",
},
{
"Macintosh Cyrillic",
"MacintoshCyrillic",
"",
"MacintoshCyrillic",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
},
{
"Windows 874",
"Windows874",
"",
"Windows874",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-874.txt",
},
{
"Windows 1250",
"Windows1250",
"",
"Windows1250",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1250.txt",
},
{
"Windows 1251",
"Windows1251",
"",
"Windows1251",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1251.txt",
},
{
"Windows 1252",
"Windows1252",
"",
"Windows1252",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1252.txt",
},
{
"Windows 1253",
"Windows1253",
"",
"Windows1253",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1253.txt",
},
{
"Windows 1254",
"Windows1254",
"",
"Windows1254",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1254.txt",
},
{
"Windows 1255",
"Windows1255",
"",
"Windows1255",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1255.txt",
},
{
"Windows 1256",
"Windows1256",
"",
"Windows1256",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1256.txt",
},
{
"Windows 1257",
"Windows1257",
"",
"Windows1257",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1257.txt",
},
{
"Windows 1258",
"Windows1258",
"",
"Windows1258",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1258.txt",
},
{
"X-User-Defined",
"XUserDefined",
"It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
"XUserDefined",
encoding.ASCIISub,
ascii +
"\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
"\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
"\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
"\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
"\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
"\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
"\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
"\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
"\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
"\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
"\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
"\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
"\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
"\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
"\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
"\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
},
}
func getWHATWG(url string) string {
res, err := http.Get(url)
if err != nil {
log.Fatalf("%q: Get: %v", url, err)
}
defer res.Body.Close()
mapping := make([]rune, 128)
for i := range mapping {
mapping[i] = '\ufffd'
}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := 0, 0
if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0 || 128 <= x {
log.Fatalf("code %d is out of range", x)
}
if 0x80 <= y && y < 0xa0 {
// We diverge from the WHATWG spec by mapping control characters
// in the range [0x80, 0xa0) to U+FFFD.
continue
}
mapping[x] = rune(y)
}
return ascii + string(mapping)
}
func getUCM(url string) string {
res, err := http.Get(url)
if err != nil {
log.Fatalf("%q: Get: %v", url, err)
}
defer res.Body.Close()
mapping := make([]rune, 256)
for i := range mapping {
mapping[i] = '\ufffd'
}
charsFound := 0
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
var c byte
var r rune
if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
continue
}
mapping[c] = r
charsFound++
}
if charsFound < 200 {
log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
}
return string(mapping)
}
func main() {
mibs := map[string]bool{}
all := []string{}
w := gen.NewCodeWriter()
defer w.WriteGoFile("tables.go", "charmap")
printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }
printf("import (\n")
printf("\t\"golang.org/x/text/encoding\"\n")
printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n")
printf(")\n\n")
for _, e := range encodings {
varNames := strings.Split(e.varName, ",")
all = append(all, varNames...)
varName := varNames[0]
switch {
case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
e.mapping = getWHATWG(e.mapping)
case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
e.mapping = getUCM(e.mapping)
}
asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
if asciiSuperset {
low = 0x80
}
lvn := 1
if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
lvn = 3
}
lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
printf("// %s is the %s encoding.\n", varName, e.name)
if e.comment != "" {
printf("//\n// %s\n", e.comment)
}
printf("var %s *Charmap = &%s\n\nvar %s = Charmap{\nname: %q,\n",
varName, lowerVarName, lowerVarName, e.name)
if mibs[e.mib] {
log.Fatalf("MIB type %q declared multiple times.", e.mib)
}
printf("mib: identifier.%s,\n", e.mib)
printf("asciiSuperset: %t,\n", asciiSuperset)
printf("low: 0x%02x,\n", low)
printf("replacement: 0x%02x,\n", e.replacement)
printf("decode: [256]utf8Enc{\n")
i, backMapping := 0, map[rune]byte{}
for _, c := range e.mapping {
if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
backMapping[c] = byte(i)
}
var buf [8]byte
n := utf8.EncodeRune(buf[:], c)
if n > 3 {
panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
}
printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
if i%2 == 1 {
printf("\n")
}
i++
}
printf("},\n")
printf("encode: [256]uint32{\n")
encode := make([]uint32, 0, 256)
for c, i := range backMapping {
encode = append(encode, uint32(i)<<24|uint32(c))
}
sort.Sort(byRune(encode))
for len(encode) < cap(encode) {
encode = append(encode, encode[len(encode)-1])
}
for i, enc := range encode {
printf("0x%08x,", enc)
if i%8 == 7 {
printf("\n")
}
}
printf("},\n}\n")
// Add an estimate of the size of a single Charmap{} struct value, which
// includes two 256 elem arrays of 4 bytes and some extra fields, which
// align to 3 uint64s on 64-bit architectures.
w.Size += 2*4*256 + 3*8
}
// TODO: add proper line breaking.
printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
}
type byRune []uint32
func (b byRune) Len() int { return len(b) }
func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff }
func (b byRune) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

View file

@ -1,173 +0,0 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bytes"
"encoding/json"
"fmt"
"log"
"strings"
"golang.org/x/text/internal/gen"
)
type group struct {
Encodings []struct {
Labels []string
Name string
}
}
func main() {
gen.Init()
r := gen.Open("https://encoding.spec.whatwg.org", "whatwg", "encodings.json")
var groups []group
if err := json.NewDecoder(r).Decode(&groups); err != nil {
log.Fatalf("Error reading encodings.json: %v", err)
}
w := &bytes.Buffer{}
fmt.Fprintln(w, "type htmlEncoding byte")
fmt.Fprintln(w, "const (")
for i, g := range groups {
for _, e := range g.Encodings {
key := strings.ToLower(e.Name)
name := consts[key]
if name == "" {
log.Fatalf("No const defined for %s.", key)
}
if i == 0 {
fmt.Fprintf(w, "%s htmlEncoding = iota\n", name)
} else {
fmt.Fprintf(w, "%s\n", name)
}
}
}
fmt.Fprintln(w, "numEncodings")
fmt.Fprint(w, ")\n\n")
fmt.Fprintln(w, "var canonical = [numEncodings]string{")
for _, g := range groups {
for _, e := range g.Encodings {
fmt.Fprintf(w, "%q,\n", strings.ToLower(e.Name))
}
}
fmt.Fprint(w, "}\n\n")
fmt.Fprintln(w, "var nameMap = map[string]htmlEncoding{")
for _, g := range groups {
for _, e := range g.Encodings {
for _, l := range e.Labels {
key := strings.ToLower(e.Name)
name := consts[key]
fmt.Fprintf(w, "%q: %s,\n", l, name)
}
}
}
fmt.Fprint(w, "}\n\n")
var tags []string
fmt.Fprintln(w, "var localeMap = []htmlEncoding{")
for _, loc := range locales {
tags = append(tags, loc.tag)
fmt.Fprintf(w, "%s, // %s \n", consts[loc.name], loc.tag)
}
fmt.Fprint(w, "}\n\n")
fmt.Fprintf(w, "const locales = %q\n", strings.Join(tags, " "))
gen.WriteGoFile("tables.go", "htmlindex", w.Bytes())
}
// consts maps canonical encoding name to internal constant.
var consts = map[string]string{
"utf-8": "utf8",
"ibm866": "ibm866",
"iso-8859-2": "iso8859_2",
"iso-8859-3": "iso8859_3",
"iso-8859-4": "iso8859_4",
"iso-8859-5": "iso8859_5",
"iso-8859-6": "iso8859_6",
"iso-8859-7": "iso8859_7",
"iso-8859-8": "iso8859_8",
"iso-8859-8-i": "iso8859_8I",
"iso-8859-10": "iso8859_10",
"iso-8859-13": "iso8859_13",
"iso-8859-14": "iso8859_14",
"iso-8859-15": "iso8859_15",
"iso-8859-16": "iso8859_16",
"koi8-r": "koi8r",
"koi8-u": "koi8u",
"macintosh": "macintosh",
"windows-874": "windows874",
"windows-1250": "windows1250",
"windows-1251": "windows1251",
"windows-1252": "windows1252",
"windows-1253": "windows1253",
"windows-1254": "windows1254",
"windows-1255": "windows1255",
"windows-1256": "windows1256",
"windows-1257": "windows1257",
"windows-1258": "windows1258",
"x-mac-cyrillic": "macintoshCyrillic",
"gbk": "gbk",
"gb18030": "gb18030",
// "hz-gb-2312": "hzgb2312", // Was removed from WhatWG
"big5": "big5",
"euc-jp": "eucjp",
"iso-2022-jp": "iso2022jp",
"shift_jis": "shiftJIS",
"euc-kr": "euckr",
"replacement": "replacement",
"utf-16be": "utf16be",
"utf-16le": "utf16le",
"x-user-defined": "xUserDefined",
}
// locales is taken from
// https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm.
var locales = []struct{ tag, name string }{
// The default value. Explicitly state latin to benefit from the exact
// script option, while still making 1252 the default encoding for languages
// written in Latin script.
{"und_Latn", "windows-1252"},
{"ar", "windows-1256"},
{"ba", "windows-1251"},
{"be", "windows-1251"},
{"bg", "windows-1251"},
{"cs", "windows-1250"},
{"el", "iso-8859-7"},
{"et", "windows-1257"},
{"fa", "windows-1256"},
{"he", "windows-1255"},
{"hr", "windows-1250"},
{"hu", "iso-8859-2"},
{"ja", "shift_jis"},
{"kk", "windows-1251"},
{"ko", "euc-kr"},
{"ku", "windows-1254"},
{"ky", "windows-1251"},
{"lt", "windows-1257"},
{"lv", "windows-1257"},
{"mk", "windows-1251"},
{"pl", "iso-8859-2"},
{"ru", "windows-1251"},
{"sah", "windows-1251"},
{"sk", "windows-1250"},
{"sl", "iso-8859-2"},
{"sr", "windows-1251"},
{"tg", "windows-1251"},
{"th", "windows-874"},
{"tr", "windows-1254"},
{"tt", "windows-1251"},
{"uk", "windows-1251"},
{"vi", "windows-1258"},
{"zh-hans", "gb18030"},
{"zh-hant", "big5"},
}

View file

@ -1,137 +0,0 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bytes"
"encoding/xml"
"fmt"
"io"
"log"
"strings"
"golang.org/x/text/internal/gen"
)
type registry struct {
XMLName xml.Name `xml:"registry"`
Updated string `xml:"updated"`
Registry []struct {
ID string `xml:"id,attr"`
Record []struct {
Name string `xml:"name"`
Xref []struct {
Type string `xml:"type,attr"`
Data string `xml:"data,attr"`
} `xml:"xref"`
Desc struct {
Data string `xml:",innerxml"`
// Any []struct {
// Data string `xml:",chardata"`
// } `xml:",any"`
// Data string `xml:",chardata"`
} `xml:"description,"`
MIB string `xml:"value"`
Alias []string `xml:"alias"`
MIME string `xml:"preferred_alias"`
} `xml:"record"`
} `xml:"registry"`
}
func main() {
r := gen.OpenIANAFile("assignments/character-sets/character-sets.xml")
reg := &registry{}
if err := xml.NewDecoder(r).Decode(&reg); err != nil && err != io.EOF {
log.Fatalf("Error decoding charset registry: %v", err)
}
if len(reg.Registry) == 0 || reg.Registry[0].ID != "character-sets-1" {
log.Fatalf("Unexpected ID %s", reg.Registry[0].ID)
}
w := &bytes.Buffer{}
fmt.Fprintf(w, "const (\n")
for _, rec := range reg.Registry[0].Record {
constName := ""
for _, a := range rec.Alias {
if strings.HasPrefix(a, "cs") && strings.IndexByte(a, '-') == -1 {
// Some of the constant definitions have comments in them. Strip those.
constName = strings.Title(strings.SplitN(a[2:], "\n", 2)[0])
}
}
if constName == "" {
switch rec.MIB {
case "2085":
constName = "HZGB2312" // Not listed as alias for some reason.
default:
log.Fatalf("No cs alias defined for %s.", rec.MIB)
}
}
if rec.MIME != "" {
rec.MIME = fmt.Sprintf(" (MIME: %s)", rec.MIME)
}
fmt.Fprintf(w, "// %s is the MIB identifier with IANA name %s%s.\n//\n", constName, rec.Name, rec.MIME)
if len(rec.Desc.Data) > 0 {
fmt.Fprint(w, "// ")
d := xml.NewDecoder(strings.NewReader(rec.Desc.Data))
inElem := true
attr := ""
for {
t, err := d.Token()
if err != nil {
if err != io.EOF {
log.Fatal(err)
}
break
}
switch x := t.(type) {
case xml.CharData:
attr = "" // Don't need attribute info.
a := bytes.Split([]byte(x), []byte("\n"))
for i, b := range a {
if b = bytes.TrimSpace(b); len(b) != 0 {
if !inElem && i > 0 {
fmt.Fprint(w, "\n// ")
}
inElem = false
fmt.Fprintf(w, "%s ", string(b))
}
}
case xml.StartElement:
if x.Name.Local == "xref" {
inElem = true
use := false
for _, a := range x.Attr {
if a.Name.Local == "type" {
use = use || a.Value != "person"
}
if a.Name.Local == "data" && use {
attr = a.Value + " "
}
}
}
case xml.EndElement:
inElem = false
fmt.Fprint(w, attr)
}
}
fmt.Fprint(w, "\n")
}
for _, x := range rec.Xref {
switch x.Type {
case "rfc":
fmt.Fprintf(w, "// Reference: %s\n", strings.ToUpper(x.Data))
case "uri":
fmt.Fprintf(w, "// Reference: %s\n", x.Data)
}
}
fmt.Fprintf(w, "%s MIB = %s\n", constName, rec.MIB)
fmt.Fprintln(w)
}
fmt.Fprintln(w, ")")
gen.WriteGoFile("mib.go", "identifier", w.Bytes())
}

View file

@ -538,8 +538,6 @@ const (
// ISO111ECMACyrillic is the MIB identifier with IANA name ECMA-cyrillic.
//
// ISO registry
// (formerly ECMA
// registry )
ISO111ECMACyrillic MIB = 77
// ISO121Canadian1 is the MIB identifier with IANA name CSA_Z243.4-1985-1.
@ -732,18 +730,18 @@ const (
// ISO885913 is the MIB identifier with IANA name ISO-8859-13.
//
// ISO See http://www.iana.org/assignments/charset-reg/ISO-8859-13 http://www.iana.org/assignments/charset-reg/ISO-8859-13
// ISO See https://www.iana.org/assignments/charset-reg/ISO-8859-13 https://www.iana.org/assignments/charset-reg/ISO-8859-13
ISO885913 MIB = 109
// ISO885914 is the MIB identifier with IANA name ISO-8859-14.
//
// ISO See http://www.iana.org/assignments/charset-reg/ISO-8859-14
// ISO See https://www.iana.org/assignments/charset-reg/ISO-8859-14
ISO885914 MIB = 110
// ISO885915 is the MIB identifier with IANA name ISO-8859-15.
//
// ISO
// Please see: http://www.iana.org/assignments/charset-reg/ISO-8859-15
// Please see: https://www.iana.org/assignments/charset-reg/ISO-8859-15
ISO885915 MIB = 111
// ISO885916 is the MIB identifier with IANA name ISO-8859-16.
@ -754,41 +752,41 @@ const (
// GBK is the MIB identifier with IANA name GBK.
//
// Chinese IT Standardization Technical Committee
// Please see: http://www.iana.org/assignments/charset-reg/GBK
// Please see: https://www.iana.org/assignments/charset-reg/GBK
GBK MIB = 113
// GB18030 is the MIB identifier with IANA name GB18030.
//
// Chinese IT Standardization Technical Committee
// Please see: http://www.iana.org/assignments/charset-reg/GB18030
// Please see: https://www.iana.org/assignments/charset-reg/GB18030
GB18030 MIB = 114
// OSDEBCDICDF0415 is the MIB identifier with IANA name OSD_EBCDIC_DF04_15.
//
// Fujitsu-Siemens standard mainframe EBCDIC encoding
// Please see: http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-15
// Please see: https://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-15
OSDEBCDICDF0415 MIB = 115
// OSDEBCDICDF03IRV is the MIB identifier with IANA name OSD_EBCDIC_DF03_IRV.
//
// Fujitsu-Siemens standard mainframe EBCDIC encoding
// Please see: http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF03-IRV
// Please see: https://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF03-IRV
OSDEBCDICDF03IRV MIB = 116
// OSDEBCDICDF041 is the MIB identifier with IANA name OSD_EBCDIC_DF04_1.
//
// Fujitsu-Siemens standard mainframe EBCDIC encoding
// Please see: http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-1
// Please see: https://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-1
OSDEBCDICDF041 MIB = 117
// ISO115481 is the MIB identifier with IANA name ISO-11548-1.
//
// See http://www.iana.org/assignments/charset-reg/ISO-11548-1
// See https://www.iana.org/assignments/charset-reg/ISO-11548-1
ISO115481 MIB = 118
// KZ1048 is the MIB identifier with IANA name KZ-1048.
//
// See http://www.iana.org/assignments/charset-reg/KZ-1048
// See https://www.iana.org/assignments/charset-reg/KZ-1048
KZ1048 MIB = 119
// Unicode is the MIB identifier with IANA name ISO-10646-UCS-2.
@ -855,7 +853,7 @@ const (
// SCSU is the MIB identifier with IANA name SCSU.
//
// SCSU See http://www.iana.org/assignments/charset-reg/SCSU
// SCSU See https://www.iana.org/assignments/charset-reg/SCSU
SCSU MIB = 1011
// UTF7 is the MIB identifier with IANA name UTF-7.
@ -884,22 +882,22 @@ const (
// CESU8 is the MIB identifier with IANA name CESU-8.
//
// https://www.unicode.org/unicode/reports/tr26
// https://www.unicode.org/reports/tr26
CESU8 MIB = 1016
// UTF32 is the MIB identifier with IANA name UTF-32.
//
// https://www.unicode.org/unicode/reports/tr19/
// https://www.unicode.org/reports/tr19/
UTF32 MIB = 1017
// UTF32BE is the MIB identifier with IANA name UTF-32BE.
//
// https://www.unicode.org/unicode/reports/tr19/
// https://www.unicode.org/reports/tr19/
UTF32BE MIB = 1018
// UTF32LE is the MIB identifier with IANA name UTF-32LE.
//
// https://www.unicode.org/unicode/reports/tr19/
// https://www.unicode.org/reports/tr19/
UTF32LE MIB = 1019
// BOCU1 is the MIB identifier with IANA name BOCU-1.
@ -1461,152 +1459,152 @@ const (
// IBM00858 is the MIB identifier with IANA name IBM00858.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM00858
// IBM See https://www.iana.org/assignments/charset-reg/IBM00858
IBM00858 MIB = 2089
// IBM00924 is the MIB identifier with IANA name IBM00924.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM00924
// IBM See https://www.iana.org/assignments/charset-reg/IBM00924
IBM00924 MIB = 2090
// IBM01140 is the MIB identifier with IANA name IBM01140.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01140
// IBM See https://www.iana.org/assignments/charset-reg/IBM01140
IBM01140 MIB = 2091
// IBM01141 is the MIB identifier with IANA name IBM01141.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01141
// IBM See https://www.iana.org/assignments/charset-reg/IBM01141
IBM01141 MIB = 2092
// IBM01142 is the MIB identifier with IANA name IBM01142.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01142
// IBM See https://www.iana.org/assignments/charset-reg/IBM01142
IBM01142 MIB = 2093
// IBM01143 is the MIB identifier with IANA name IBM01143.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01143
// IBM See https://www.iana.org/assignments/charset-reg/IBM01143
IBM01143 MIB = 2094
// IBM01144 is the MIB identifier with IANA name IBM01144.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01144
// IBM See https://www.iana.org/assignments/charset-reg/IBM01144
IBM01144 MIB = 2095
// IBM01145 is the MIB identifier with IANA name IBM01145.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01145
// IBM See https://www.iana.org/assignments/charset-reg/IBM01145
IBM01145 MIB = 2096
// IBM01146 is the MIB identifier with IANA name IBM01146.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01146
// IBM See https://www.iana.org/assignments/charset-reg/IBM01146
IBM01146 MIB = 2097
// IBM01147 is the MIB identifier with IANA name IBM01147.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01147
// IBM See https://www.iana.org/assignments/charset-reg/IBM01147
IBM01147 MIB = 2098
// IBM01148 is the MIB identifier with IANA name IBM01148.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01148
// IBM See https://www.iana.org/assignments/charset-reg/IBM01148
IBM01148 MIB = 2099
// IBM01149 is the MIB identifier with IANA name IBM01149.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01149
// IBM See https://www.iana.org/assignments/charset-reg/IBM01149
IBM01149 MIB = 2100
// Big5HKSCS is the MIB identifier with IANA name Big5-HKSCS.
//
// See http://www.iana.org/assignments/charset-reg/Big5-HKSCS
// See https://www.iana.org/assignments/charset-reg/Big5-HKSCS
Big5HKSCS MIB = 2101
// IBM1047 is the MIB identifier with IANA name IBM1047.
//
// IBM1047 (EBCDIC Latin 1/Open Systems) http://www-1.ibm.com/servers/eserver/iseries/software/globalization/pdf/cp01047z.pdf
// IBM1047 (EBCDIC Latin 1/Open Systems) https://www-1.ibm.com/servers/eserver/iseries/software/globalization/pdf/cp01047z.pdf
IBM1047 MIB = 2102
// PTCP154 is the MIB identifier with IANA name PTCP154.
//
// See http://www.iana.org/assignments/charset-reg/PTCP154
// See https://www.iana.org/assignments/charset-reg/PTCP154
PTCP154 MIB = 2103
// Amiga1251 is the MIB identifier with IANA name Amiga-1251.
//
// See http://www.amiga.ultranet.ru/Amiga-1251.html
// See https://www.amiga.ultranet.ru/Amiga-1251.html
Amiga1251 MIB = 2104
// KOI7switched is the MIB identifier with IANA name KOI7-switched.
//
// See http://www.iana.org/assignments/charset-reg/KOI7-switched
// See https://www.iana.org/assignments/charset-reg/KOI7-switched
KOI7switched MIB = 2105
// BRF is the MIB identifier with IANA name BRF.
//
// See http://www.iana.org/assignments/charset-reg/BRF
// See https://www.iana.org/assignments/charset-reg/BRF
BRF MIB = 2106
// TSCII is the MIB identifier with IANA name TSCII.
//
// See http://www.iana.org/assignments/charset-reg/TSCII
// See https://www.iana.org/assignments/charset-reg/TSCII
TSCII MIB = 2107
// CP51932 is the MIB identifier with IANA name CP51932.
//
// See http://www.iana.org/assignments/charset-reg/CP51932
// See https://www.iana.org/assignments/charset-reg/CP51932
CP51932 MIB = 2108
// Windows874 is the MIB identifier with IANA name windows-874.
//
// See http://www.iana.org/assignments/charset-reg/windows-874
// See https://www.iana.org/assignments/charset-reg/windows-874
Windows874 MIB = 2109
// Windows1250 is the MIB identifier with IANA name windows-1250.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1250
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1250
Windows1250 MIB = 2250
// Windows1251 is the MIB identifier with IANA name windows-1251.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1251
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1251
Windows1251 MIB = 2251
// Windows1252 is the MIB identifier with IANA name windows-1252.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1252
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1252
Windows1252 MIB = 2252
// Windows1253 is the MIB identifier with IANA name windows-1253.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1253
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1253
Windows1253 MIB = 2253
// Windows1254 is the MIB identifier with IANA name windows-1254.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1254
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1254
Windows1254 MIB = 2254
// Windows1255 is the MIB identifier with IANA name windows-1255.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1255
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1255
Windows1255 MIB = 2255
// Windows1256 is the MIB identifier with IANA name windows-1256.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1256
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1256
Windows1256 MIB = 2256
// Windows1257 is the MIB identifier with IANA name windows-1257.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1257
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1257
Windows1257 MIB = 2257
// Windows1258 is the MIB identifier with IANA name windows-1258.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1258
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1258
Windows1258 MIB = 2258
// TIS620 is the MIB identifier with IANA name TIS-620.
@ -1616,6 +1614,6 @@ const (
// CP50220 is the MIB identifier with IANA name CP50220.
//
// See http://www.iana.org/assignments/charset-reg/CP50220
// See https://www.iana.org/assignments/charset-reg/CP50220
CP50220 MIB = 2260
)

View file

@ -1,161 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
// TODO: Emoji extensions?
// https://www.unicode.org/faq/emoji_dingbats.html
// https://www.unicode.org/Public/UNIDATA/EmojiSources.txt
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
type entry struct {
jisCode, table int
}
func main() {
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
fmt.Printf("// Package japanese provides Japanese encodings such as EUC-JP and Shift JIS.\n")
fmt.Printf(`package japanese // import "golang.org/x/text/encoding/japanese"` + "\n\n")
reverse := [65536]entry{}
for i := range reverse {
reverse[i].table = -1
}
tables := []struct {
url string
name string
}{
{"http://encoding.spec.whatwg.org/index-jis0208.txt", "0208"},
{"http://encoding.spec.whatwg.org/index-jis0212.txt", "0212"},
}
for i, table := range tables {
res, err := http.Get(table.url)
if err != nil {
log.Fatalf("%q: Get: %v", table.url, err)
}
defer res.Body.Close()
mapping := [65536]uint16{}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := 0, uint16(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("%q: could not parse %q", table.url, s)
}
if x < 0 || 120*94 <= x {
log.Fatalf("%q: JIS code %d is out of range", table.url, x)
}
mapping[x] = y
if reverse[y].table == -1 {
reverse[y] = entry{jisCode: x, table: i}
}
}
if err := scanner.Err(); err != nil {
log.Fatalf("%q: scanner error: %v", table.url, err)
}
fmt.Printf("// jis%sDecode is the decoding table from JIS %s code to Unicode.\n// It is defined at %s\n",
table.name, table.name, table.url)
fmt.Printf("var jis%sDecode = [...]uint16{\n", table.name)
for i, m := range mapping {
if m != 0 {
fmt.Printf("\t%d: 0x%04X,\n", i, m)
}
}
fmt.Printf("}\n\n")
}
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v.table == -1 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("const (\n")
fmt.Printf("\tjis0208 = 1\n")
fmt.Printf("\tjis0212 = 2\n")
fmt.Printf("\tcodeMask = 0x7f\n")
fmt.Printf("\tcodeShift = 7\n")
fmt.Printf("\ttableShift = 14\n")
fmt.Printf(")\n\n")
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
fmt.Printf("// encodeX are the encoding tables from Unicode to JIS code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("//\n")
fmt.Printf("// The high two bits of the value record whether the JIS code comes from the\n")
fmt.Printf("// JIS0208 table (high bits == 1) or the JIS0212 table (high bits == 2).\n")
fmt.Printf("// The low 14 bits are two 7-bit unsigned integers j1 and j2 that form the\n")
fmt.Printf("// JIS code (94*j1 + j2) within that table.\n")
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x.table == -1 {
continue
}
fmt.Printf("\t%d - %d: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n",
j, v.low, tables[x.table].name, x.jisCode/94, x.jisCode%94)
}
fmt.Printf("}\n\n")
}
}
// interval is a half-open interval [low, high).
type interval struct {
low, high int
}
func (i interval) len() int { return i.high - i.low }
// byDecreasingLength sorts intervals by decreasing length.
type byDecreasingLength []interval
func (b byDecreasingLength) Len() int { return len(b) }
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

View file

@ -1,143 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
func main() {
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
fmt.Printf("// Package korean provides Korean encodings such as EUC-KR.\n")
fmt.Printf(`package korean // import "golang.org/x/text/encoding/korean"` + "\n\n")
res, err := http.Get("http://encoding.spec.whatwg.org/index-euc-kr.txt")
if err != nil {
log.Fatalf("Get: %v", err)
}
defer res.Body.Close()
mapping := [65536]uint16{}
reverse := [65536]uint16{}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := uint16(0), uint16(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0 || 178*(0xc7-0x81)+(0xfe-0xc7)*94+(0xff-0xa1) <= x {
log.Fatalf("EUC-KR code %d is out of range", x)
}
mapping[x] = y
if reverse[y] == 0 {
c0, c1 := uint16(0), uint16(0)
if x < 178*(0xc7-0x81) {
c0 = uint16(x/178) + 0x81
c1 = uint16(x % 178)
switch {
case c1 < 1*26:
c1 += 0x41
case c1 < 2*26:
c1 += 0x47
default:
c1 += 0x4d
}
} else {
x -= 178 * (0xc7 - 0x81)
c0 = uint16(x/94) + 0xc7
c1 = uint16(x%94) + 0xa1
}
reverse[y] = c0<<8 | c1
}
}
if err := scanner.Err(); err != nil {
log.Fatalf("scanner error: %v", err)
}
fmt.Printf("// decode is the decoding table from EUC-KR code to Unicode.\n")
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-euc-kr.txt\n")
fmt.Printf("var decode = [...]uint16{\n")
for i, v := range mapping {
if v != 0 {
fmt.Printf("\t%d: 0x%04X,\n", i, v)
}
}
fmt.Printf("}\n\n")
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v == 0 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
fmt.Printf("// encodeX are the encoding tables from Unicode to EUC-KR code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x == 0 {
continue
}
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
}
fmt.Printf("}\n\n")
}
}
// interval is a half-open interval [low, high).
type interval struct {
low, high int
}
func (i interval) len() int { return i.high - i.low }
// byDecreasingLength sorts intervals by decreasing length.
type byDecreasingLength []interval
func (b byDecreasingLength) Len() int { return len(b) }
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

View file

@ -1,161 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
func main() {
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
fmt.Printf("// Package simplifiedchinese provides Simplified Chinese encodings such as GBK.\n")
fmt.Printf(`package simplifiedchinese // import "golang.org/x/text/encoding/simplifiedchinese"` + "\n\n")
printGB18030()
printGBK()
}
func printGB18030() {
res, err := http.Get("http://encoding.spec.whatwg.org/index-gb18030.txt")
if err != nil {
log.Fatalf("Get: %v", err)
}
defer res.Body.Close()
fmt.Printf("// gb18030 is the table from http://encoding.spec.whatwg.org/index-gb18030.txt\n")
fmt.Printf("var gb18030 = [...][2]uint16{\n")
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := uint32(0), uint32(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0x10000 && y < 0x10000 {
fmt.Printf("\t{0x%04x, 0x%04x},\n", x, y)
}
}
fmt.Printf("}\n\n")
}
func printGBK() {
res, err := http.Get("http://encoding.spec.whatwg.org/index-gbk.txt")
if err != nil {
log.Fatalf("Get: %v", err)
}
defer res.Body.Close()
mapping := [65536]uint16{}
reverse := [65536]uint16{}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := uint16(0), uint16(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0 || 126*190 <= x {
log.Fatalf("GBK code %d is out of range", x)
}
mapping[x] = y
if reverse[y] == 0 {
c0, c1 := x/190, x%190
if c1 >= 0x3f {
c1++
}
reverse[y] = (0x81+c0)<<8 | (0x40 + c1)
}
}
if err := scanner.Err(); err != nil {
log.Fatalf("scanner error: %v", err)
}
fmt.Printf("// decode is the decoding table from GBK code to Unicode.\n")
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-gbk.txt\n")
fmt.Printf("var decode = [...]uint16{\n")
for i, v := range mapping {
if v != 0 {
fmt.Printf("\t%d: 0x%04X,\n", i, v)
}
}
fmt.Printf("}\n\n")
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v == 0 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
fmt.Printf("// encodeX are the encoding tables from Unicode to GBK code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x == 0 {
continue
}
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
}
fmt.Printf("}\n\n")
}
}
// interval is a half-open interval [low, high).
type interval struct {
low, high int
}
func (i interval) len() int { return i.high - i.low }
// byDecreasingLength sorts intervals by decreasing length.
type byDecreasingLength []interval
func (b byDecreasingLength) Len() int { return len(b) }
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

View file

@ -1,140 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
func main() {
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
fmt.Printf("// Package traditionalchinese provides Traditional Chinese encodings such as Big5.\n")
fmt.Printf(`package traditionalchinese // import "golang.org/x/text/encoding/traditionalchinese"` + "\n\n")
res, err := http.Get("http://encoding.spec.whatwg.org/index-big5.txt")
if err != nil {
log.Fatalf("Get: %v", err)
}
defer res.Body.Close()
mapping := [65536]uint32{}
reverse := [65536 * 4]uint16{}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := uint16(0), uint32(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0 || 126*157 <= x {
log.Fatalf("Big5 code %d is out of range", x)
}
mapping[x] = y
// The WHATWG spec http://encoding.spec.whatwg.org/#indexes says that
// "The index pointer for code point in index is the first pointer
// corresponding to code point in index", which would normally mean
// that the code below should be guarded by "if reverse[y] == 0", but
// last instead of first seems to match the behavior of
// "iconv -f UTF-8 -t BIG5". For example, U+8005 者 occurs twice in
// http://encoding.spec.whatwg.org/index-big5.txt, as index 2148
// (encoded as "\x8e\xcd") and index 6543 (encoded as "\xaa\xcc")
// and "echo 者 | iconv -f UTF-8 -t BIG5 | xxd" gives "\xaa\xcc".
c0, c1 := x/157, x%157
if c1 < 0x3f {
c1 += 0x40
} else {
c1 += 0x62
}
reverse[y] = (0x81+c0)<<8 | c1
}
if err := scanner.Err(); err != nil {
log.Fatalf("scanner error: %v", err)
}
fmt.Printf("// decode is the decoding table from Big5 code to Unicode.\n")
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-big5.txt\n")
fmt.Printf("var decode = [...]uint32{\n")
for i, v := range mapping {
if v != 0 {
fmt.Printf("\t%d: 0x%08X,\n", i, v)
}
}
fmt.Printf("}\n\n")
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v == 0 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
fmt.Printf("// encodeX are the encoding tables from Unicode to Big5 code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%6d, %6d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x == 0 {
continue
}
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
}
fmt.Printf("}\n\n")
}
}
// interval is a half-open interval [low, high).
type interval struct {
low, high int
}
func (i interval) len() int { return i.high - i.low }
// byDecreasingLength sorts intervals by decreasing length.
type byDecreasingLength []interval
func (b byDecreasingLength) Len() int { return len(b) }
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

View file

@ -1,64 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
// Language tag table generator.
// Data read from the web.
package main
import (
"flag"
"fmt"
"log"
"golang.org/x/text/internal/gen"
"golang.org/x/text/unicode/cldr"
)
var (
test = flag.Bool("test",
false,
"test existing tables; can be used to compare web data with package data.")
outputFile = flag.String("output",
"tables.go",
"output file for generated tables")
)
func main() {
gen.Init()
w := gen.NewCodeWriter()
defer w.WriteGoFile("tables.go", "compact")
fmt.Fprintln(w, `import "golang.org/x/text/internal/language"`)
b := newBuilder(w)
gen.WriteCLDRVersion(w)
b.writeCompactIndex()
}
type builder struct {
w *gen.CodeWriter
data *cldr.CLDR
supp *cldr.SupplementalData
}
func newBuilder(w *gen.CodeWriter) *builder {
r := gen.OpenCLDRCoreZip()
defer r.Close()
d := &cldr.Decoder{}
data, err := d.DecodeZip(r)
if err != nil {
log.Fatal(err)
}
b := builder{
w: w,
data: data,
supp: data.Supplemental(),
}
return &b
}

View file

@ -1,113 +0,0 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This file generates derivative tables based on the language package itself.
import (
"fmt"
"log"
"sort"
"strings"
"golang.org/x/text/internal/language"
)
// Compact indices:
// Note -va-X variants only apply to localization variants.
// BCP variants only ever apply to language.
// The only ambiguity between tags is with regions.
func (b *builder) writeCompactIndex() {
// Collect all language tags for which we have any data in CLDR.
m := map[language.Tag]bool{}
for _, lang := range b.data.Locales() {
// We include all locales unconditionally to be consistent with en_US.
// We want en_US, even though it has no data associated with it.
// TODO: put any of the languages for which no data exists at the end
// of the index. This allows all components based on ICU to use that
// as the cutoff point.
// if x := data.RawLDML(lang); false ||
// x.LocaleDisplayNames != nil ||
// x.Characters != nil ||
// x.Delimiters != nil ||
// x.Measurement != nil ||
// x.Dates != nil ||
// x.Numbers != nil ||
// x.Units != nil ||
// x.ListPatterns != nil ||
// x.Collations != nil ||
// x.Segmentations != nil ||
// x.Rbnf != nil ||
// x.Annotations != nil ||
// x.Metadata != nil {
// TODO: support POSIX natively, albeit non-standard.
tag := language.Make(strings.Replace(lang, "_POSIX", "-u-va-posix", 1))
m[tag] = true
// }
}
// TODO: plural rules are also defined for the deprecated tags:
// iw mo sh tl
// Consider removing these as compact tags.
// Include locales for plural rules, which uses a different structure.
for _, plurals := range b.supp.Plurals {
for _, rules := range plurals.PluralRules {
for _, lang := range strings.Split(rules.Locales, " ") {
m[language.Make(lang)] = true
}
}
}
var coreTags []language.CompactCoreInfo
var special []string
for t := range m {
if x := t.Extensions(); len(x) != 0 && fmt.Sprint(x) != "[u-va-posix]" {
log.Fatalf("Unexpected extension %v in %v", x, t)
}
if len(t.Variants()) == 0 && len(t.Extensions()) == 0 {
cci, ok := language.GetCompactCore(t)
if !ok {
log.Fatalf("Locale for non-basic language %q", t)
}
coreTags = append(coreTags, cci)
} else {
special = append(special, t.String())
}
}
w := b.w
sort.Slice(coreTags, func(i, j int) bool { return coreTags[i] < coreTags[j] })
sort.Strings(special)
w.WriteComment(`
NumCompactTags is the number of common tags. The maximum tag is
NumCompactTags-1.`)
w.WriteConst("NumCompactTags", len(m))
fmt.Fprintln(w, "const (")
for i, t := range coreTags {
fmt.Fprintf(w, "%s ID = %d\n", ident(t.Tag().String()), i)
}
for i, t := range special {
fmt.Fprintf(w, "%s ID = %d\n", ident(t), i+len(coreTags))
}
fmt.Fprintln(w, ")")
w.WriteVar("coreTags", coreTags)
w.WriteConst("specialTagsStr", strings.Join(special, " "))
}
func ident(s string) string {
return strings.Replace(s, "-", "", -1) + "Index"
}

View file

@ -1,54 +0,0 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"log"
"golang.org/x/text/internal/gen"
"golang.org/x/text/internal/language"
"golang.org/x/text/internal/language/compact"
"golang.org/x/text/unicode/cldr"
)
func main() {
r := gen.OpenCLDRCoreZip()
defer r.Close()
d := &cldr.Decoder{}
data, err := d.DecodeZip(r)
if err != nil {
log.Fatalf("DecodeZip: %v", err)
}
w := gen.NewCodeWriter()
defer w.WriteGoFile("parents.go", "compact")
// Create parents table.
type ID uint16
parents := make([]ID, compact.NumCompactTags)
for _, loc := range data.Locales() {
tag := language.MustParse(loc)
index, ok := compact.FromTag(tag)
if !ok {
continue
}
parentIndex := compact.ID(0) // und
for p := tag.Parent(); p != language.Und; p = p.Parent() {
if x, ok := compact.FromTag(p); ok {
parentIndex = x
break
}
}
parents[index] = ID(parentIndex)
}
w.WriteComment(`
parents maps a compact index of a tag to the compact index of the parent of
this tag.`)
w.WriteVar("parents", parents)
}

File diff suppressed because it is too large Load diff

View file

@ -1,20 +0,0 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This file contains code common to the maketables.go and the package code.
// AliasType is the type of an alias in AliasMap.
type AliasType int8
const (
Deprecated AliasType = iota
Macro
Legacy
AliasTypeUnknown AliasType = -1
)

View file

@ -1,305 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
// Language tag table generator.
// Data read from the web.
package main
import (
"flag"
"fmt"
"io"
"log"
"sort"
"strconv"
"strings"
"golang.org/x/text/internal/gen"
"golang.org/x/text/internal/language"
"golang.org/x/text/unicode/cldr"
)
var (
test = flag.Bool("test",
false,
"test existing tables; can be used to compare web data with package data.")
outputFile = flag.String("output",
"tables.go",
"output file for generated tables")
)
func main() {
gen.Init()
w := gen.NewCodeWriter()
defer w.WriteGoFile("tables.go", "language")
b := newBuilder(w)
gen.WriteCLDRVersion(w)
b.writeConstants()
b.writeMatchData()
}
type builder struct {
w *gen.CodeWriter
hw io.Writer // MultiWriter for w and w.Hash
data *cldr.CLDR
supp *cldr.SupplementalData
}
func (b *builder) langIndex(s string) uint16 {
return uint16(language.MustParseBase(s))
}
func (b *builder) regionIndex(s string) int {
return int(language.MustParseRegion(s))
}
func (b *builder) scriptIndex(s string) int {
return int(language.MustParseScript(s))
}
func newBuilder(w *gen.CodeWriter) *builder {
r := gen.OpenCLDRCoreZip()
defer r.Close()
d := &cldr.Decoder{}
data, err := d.DecodeZip(r)
if err != nil {
log.Fatal(err)
}
b := builder{
w: w,
hw: io.MultiWriter(w, w.Hash),
data: data,
supp: data.Supplemental(),
}
return &b
}
// writeConsts computes f(v) for all v in values and writes the results
// as constants named _v to a single constant block.
func (b *builder) writeConsts(f func(string) int, values ...string) {
fmt.Fprintln(b.w, "const (")
for _, v := range values {
fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v))
}
fmt.Fprintln(b.w, ")")
}
// TODO: region inclusion data will probably not be use used in future matchers.
var langConsts = []string{
"de", "en", "fr", "it", "mo", "no", "nb", "pt", "sh", "mul", "und",
}
var scriptConsts = []string{
"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
"Zzzz",
}
var regionConsts = []string{
"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
"ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
}
func (b *builder) writeConstants() {
b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
b.writeConsts(b.regionIndex, regionConsts...)
b.writeConsts(b.scriptIndex, scriptConsts...)
}
type mutualIntelligibility struct {
want, have uint16
distance uint8
oneway bool
}
type scriptIntelligibility struct {
wantLang, haveLang uint16
wantScript, haveScript uint8
distance uint8
// Always oneway
}
type regionIntelligibility struct {
lang uint16 // compact language id
script uint8 // 0 means any
group uint8 // 0 means any; if bit 7 is set it means inverse
distance uint8
// Always twoway.
}
// writeMatchData writes tables with languages and scripts for which there is
// mutual intelligibility. The data is based on CLDR's languageMatching data.
// Note that we use a different algorithm than the one defined by CLDR and that
// we slightly modify the data. For example, we convert scores to confidence levels.
// We also drop all region-related data as we use a different algorithm to
// determine region equivalence.
func (b *builder) writeMatchData() {
lm := b.supp.LanguageMatching.LanguageMatches
cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")
regionHierarchy := map[string][]string{}
for _, g := range b.supp.TerritoryContainment.Group {
regions := strings.Split(g.Contains, " ")
regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)
}
regionToGroups := make([]uint8, language.NumRegions)
idToIndex := map[string]uint8{}
for i, mv := range lm[0].MatchVariable {
if i > 6 {
log.Fatalf("Too many groups: %d", i)
}
idToIndex[mv.Id] = uint8(i + 1)
// TODO: also handle '-'
for _, r := range strings.Split(mv.Value, "+") {
todo := []string{r}
for k := 0; k < len(todo); k++ {
r := todo[k]
regionToGroups[b.regionIndex(r)] |= 1 << uint8(i)
todo = append(todo, regionHierarchy[r]...)
}
}
}
b.w.WriteVar("regionToGroups", regionToGroups)
// maps language id to in- and out-of-group region.
paradigmLocales := [][3]uint16{}
locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
for i := 0; i < len(locales); i += 2 {
x := [3]uint16{}
for j := 0; j < 2; j++ {
pc := strings.SplitN(locales[i+j], "-", 2)
x[0] = b.langIndex(pc[0])
if len(pc) == 2 {
x[1+j] = uint16(b.regionIndex(pc[1]))
}
}
paradigmLocales = append(paradigmLocales, x)
}
b.w.WriteVar("paradigmLocales", paradigmLocales)
b.w.WriteType(mutualIntelligibility{})
b.w.WriteType(scriptIntelligibility{})
b.w.WriteType(regionIntelligibility{})
matchLang := []mutualIntelligibility{}
matchScript := []scriptIntelligibility{}
matchRegion := []regionIntelligibility{}
// Convert the languageMatch entries in lists keyed by desired language.
for _, m := range lm[0].LanguageMatch {
// Different versions of CLDR use different separators.
desired := strings.Replace(m.Desired, "-", "_", -1)
supported := strings.Replace(m.Supported, "-", "_", -1)
d := strings.Split(desired, "_")
s := strings.Split(supported, "_")
if len(d) != len(s) {
log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
continue
}
distance, _ := strconv.ParseInt(m.Distance, 10, 8)
switch len(d) {
case 2:
if desired == supported && desired == "*_*" {
continue
}
// language-script pair.
matchScript = append(matchScript, scriptIntelligibility{
wantLang: uint16(b.langIndex(d[0])),
haveLang: uint16(b.langIndex(s[0])),
wantScript: uint8(b.scriptIndex(d[1])),
haveScript: uint8(b.scriptIndex(s[1])),
distance: uint8(distance),
})
if m.Oneway != "true" {
matchScript = append(matchScript, scriptIntelligibility{
wantLang: uint16(b.langIndex(s[0])),
haveLang: uint16(b.langIndex(d[0])),
wantScript: uint8(b.scriptIndex(s[1])),
haveScript: uint8(b.scriptIndex(d[1])),
distance: uint8(distance),
})
}
case 1:
if desired == supported && desired == "*" {
continue
}
if distance == 1 {
// nb == no is already handled by macro mapping. Check there
// really is only this case.
if d[0] != "no" || s[0] != "nb" {
log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
}
continue
}
// TODO: consider dropping oneway field and just doubling the entry.
matchLang = append(matchLang, mutualIntelligibility{
want: uint16(b.langIndex(d[0])),
have: uint16(b.langIndex(s[0])),
distance: uint8(distance),
oneway: m.Oneway == "true",
})
case 3:
if desired == supported && desired == "*_*_*" {
continue
}
if desired != supported {
// This is now supported by CLDR, but only one case, which
// should already be covered by paradigm locales. For instance,
// test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in
// testdata/CLDRLocaleMatcherTest.txt tests this.
if supported != "en_*_GB" {
log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
}
continue
}
ri := regionIntelligibility{
lang: b.langIndex(d[0]),
distance: uint8(distance),
}
if d[1] != "*" {
ri.script = uint8(b.scriptIndex(d[1]))
}
switch {
case d[2] == "*":
ri.group = 0x80 // not contained in anything
case strings.HasPrefix(d[2], "$!"):
ri.group = 0x80
d[2] = "$" + d[2][len("$!"):]
fallthrough
case strings.HasPrefix(d[2], "$"):
ri.group |= idToIndex[d[2]]
}
matchRegion = append(matchRegion, ri)
default:
log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
}
}
sort.SliceStable(matchLang, func(i, j int) bool {
return matchLang[i].distance < matchLang[j].distance
})
b.w.WriteComment(`
matchLang holds pairs of langIDs of base languages that are typically
mutually intelligible. Each pair is associated with a confidence and
whether the intelligibility goes one or both ways.`)
b.w.WriteVar("matchLang", matchLang)
b.w.WriteComment(`
matchScript holds pairs of scriptIDs where readers of one script
can typically also read the other. Each is associated with a confidence.`)
sort.SliceStable(matchScript, func(i, j int) bool {
return matchScript[i].distance < matchScript[j].distance
})
b.w.WriteVar("matchScript", matchScript)
sort.SliceStable(matchRegion, func(i, j int) bool {
return matchRegion[i].distance < matchRegion[j].distance
})
b.w.WriteVar("matchRegion", matchRegion)
}

View file

@ -530,7 +530,7 @@ func (r Region) String() string {
// Note that not all regions have a 3-letter ISO code.
// In such cases this method returns "ZZZ".
func (r Region) ISO3() string {
return r.regionID.String()
return r.regionID.ISO3()
}
// M49 returns the UN M.49 encoding of r, or 0 if this encoding

View file

@ -493,7 +493,7 @@ func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err erro
return dstL.n, srcL.p, err
}
// Deprecated: use runes.Remove instead.
// Deprecated: Use runes.Remove instead.
func RemoveFunc(f func(r rune) bool) Transformer {
return removeF(f)
}

View file

@ -15,8 +15,51 @@ import (
// Options provide toggles and overrides to control specific rendering behaviors.
type Options struct {
PrettyTables bool // Turns on pretty ASCII rendering for table elements.
OmitLinks bool // Turns on omitting links
PrettyTables bool // Turns on pretty ASCII rendering for table elements.
PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements.
OmitLinks bool // Turns on omitting links
}
// PrettyTablesOptions overrides tablewriter behaviors
type PrettyTablesOptions struct {
AutoFormatHeader bool
AutoWrapText bool
ReflowDuringAutoWrap bool
ColWidth int
ColumnSeparator string
RowSeparator string
CenterSeparator string
HeaderAlignment int
FooterAlignment int
Alignment int
ColumnAlignment []int
NewLine string
HeaderLine bool
RowLine bool
AutoMergeCells bool
Borders tablewriter.Border
}
// NewPrettyTablesOptions creates PrettyTablesOptions with default settings
func NewPrettyTablesOptions() *PrettyTablesOptions {
return &PrettyTablesOptions{
AutoFormatHeader: true,
AutoWrapText: true,
ReflowDuringAutoWrap: true,
ColWidth: tablewriter.MAX_ROW_WIDTH,
ColumnSeparator: tablewriter.COLUMN,
RowSeparator: tablewriter.ROW,
CenterSeparator: tablewriter.CENTER,
HeaderAlignment: tablewriter.ALIGN_DEFAULT,
FooterAlignment: tablewriter.ALIGN_DEFAULT,
Alignment: tablewriter.ALIGN_DEFAULT,
ColumnAlignment: []int{},
NewLine: tablewriter.NEWLINE,
HeaderLine: true,
RowLine: false,
AutoMergeCells: false,
Borders: tablewriter.Border{Left: true, Right: true, Bottom: true, Top: true},
}
}
// FromHTMLNode renders text output from a pre-parsed HTML document.
@ -277,6 +320,25 @@ func (ctx *textifyTraverseContext) handleTableElement(node *html.Node) error {
buf := &bytes.Buffer{}
table := tablewriter.NewWriter(buf)
if ctx.options.PrettyTablesOptions != nil {
options := ctx.options.PrettyTablesOptions
table.SetAutoFormatHeaders(options.AutoFormatHeader)
table.SetAutoWrapText(options.AutoWrapText)
table.SetReflowDuringAutoWrap(options.ReflowDuringAutoWrap)
table.SetColWidth(options.ColWidth)
table.SetColumnSeparator(options.ColumnSeparator)
table.SetRowSeparator(options.RowSeparator)
table.SetCenterSeparator(options.CenterSeparator)
table.SetHeaderAlignment(options.HeaderAlignment)
table.SetFooterAlignment(options.FooterAlignment)
table.SetAlignment(options.Alignment)
table.SetColumnAlignment(options.ColumnAlignment)
table.SetNewLine(options.NewLine)
table.SetHeaderLine(options.HeaderLine)
table.SetRowLine(options.RowLine)
table.SetAutoMergeCells(options.AutoMergeCells)
table.SetBorders(options.Borders)
}
table.SetHeader(ctx.tableCtx.header)
table.SetFooter(ctx.tableCtx.footer)
table.AppendBulk(ctx.tableCtx.body)
@ -337,7 +399,7 @@ func (ctx *textifyTraverseContext) traverse(node *html.Node) error {
if ctx.isPre {
data = node.Data
} else {
data = strings.Trim(spacingRe.ReplaceAllString(node.Data, " "), " ")
data = strings.TrimSpace(spacingRe.ReplaceAllString(node.Data, " "))
}
return ctx.emit(data)

146
vendor/vendor.json vendored
View file

@ -33,34 +33,34 @@
"revisionTime": "2019-01-24T09:32:44Z"
},
{
"checksumSHA1": "vDMP0idNbOYidOmOILZBmFkndH0=",
"checksumSHA1": "0etQIvNU8zcr74rQKJLeH5O+MzU=",
"path": "github.com/mmcdole/gofeed",
"revision": "bef0272b721df659ef4abe302ae89e7182be6692",
"revisionTime": "2019-01-30T02:39:03Z"
"revision": "0e68beaf6fdf215bd1fe42a09f6de292c7032359",
"revisionTime": "2019-04-20T15:49:28Z"
},
{
"checksumSHA1": "Z5g81IePo5p+/qvAn5sAVCkXBgM=",
"checksumSHA1": "+plUZFo1eUTODCmc1ptE2VUuWM4=",
"path": "github.com/mmcdole/gofeed/atom",
"revision": "bef0272b721df659ef4abe302ae89e7182be6692",
"revisionTime": "2019-01-30T02:39:03Z"
"revision": "0e68beaf6fdf215bd1fe42a09f6de292c7032359",
"revisionTime": "2019-04-20T15:49:28Z"
},
{
"checksumSHA1": "30XHSW5SZy4yL+ezmw0xyhGdM+Y=",
"path": "github.com/mmcdole/gofeed/extensions",
"revision": "bef0272b721df659ef4abe302ae89e7182be6692",
"revisionTime": "2019-01-30T02:39:03Z"
"revision": "0e68beaf6fdf215bd1fe42a09f6de292c7032359",
"revisionTime": "2019-04-20T15:49:28Z"
},
{
"checksumSHA1": "NwQLZRUKjy08lASzcVejfIY44NY=",
"checksumSHA1": "Aw0Zm44X0crwxj4KZTYqK2C3zT0=",
"path": "github.com/mmcdole/gofeed/internal/shared",
"revision": "bef0272b721df659ef4abe302ae89e7182be6692",
"revisionTime": "2019-01-30T02:39:03Z"
"revision": "0e68beaf6fdf215bd1fe42a09f6de292c7032359",
"revisionTime": "2019-04-20T15:49:28Z"
},
{
"checksumSHA1": "A4hb/GQtEk5ml8l9u2FYqSwhapE=",
"checksumSHA1": "4Em5hO79HIPXmUYb9s/zh6CuQ0c=",
"path": "github.com/mmcdole/gofeed/rss",
"revision": "bef0272b721df659ef4abe302ae89e7182be6692",
"revisionTime": "2019-01-30T02:39:03Z"
"revision": "0e68beaf6fdf215bd1fe42a09f6de292c7032359",
"revisionTime": "2019-04-20T15:49:28Z"
},
{
"checksumSHA1": "dMr0U8HXo4PGD9qDPxvh6Iizzh0=",
@ -69,10 +69,10 @@
"revisionTime": "2018-10-12T15:49:47Z"
},
{
"checksumSHA1": "iKe8IxLz3b/sYilOm+MW5Q5ZUAc=",
"checksumSHA1": "HZJ2dhzXoMi8n+iY80A9vsnyQUk=",
"path": "github.com/olekukonko/tablewriter",
"revision": "93462a5dfaa69905a8243682186e36764caf8680",
"revisionTime": "2019-02-14T16:47:07Z"
"revision": "1c0837c15a0bac7871496dfce5dcdd308e0a330f",
"revisionTime": "2019-05-08T01:39:46Z"
},
{
"checksumSHA1": "qErubHtC7DAFBnEQkMTuKDtfFTU=",
@ -81,130 +81,130 @@
"revisionTime": "2017-07-18T12:35:48Z"
},
{
"checksumSHA1": "qyU58wqixJNIOwqz2CXE2eMBNTk=",
"checksumSHA1": "bONEZcbkYKiPyABrecOLzHomjPU=",
"path": "golang.org/x/net/html",
"revision": "3a22650c66bd7f4fb6d1e8072ffd7b75c8a27898",
"revisionTime": "2019-02-11T12:35:32Z"
"revision": "f3200d17e092c607f615320ecaad13d87ad9a2b3",
"revisionTime": "2019-05-22T15:39:15Z"
},
{
"checksumSHA1": "XtSbs1gpyaEsIqf6VRhJsgOQe5U=",
"checksumSHA1": "xwhqe/igHQrY3IhqDwzo6j7qpm8=",
"path": "golang.org/x/net/html/atom",
"revision": "3a22650c66bd7f4fb6d1e8072ffd7b75c8a27898",
"revisionTime": "2019-02-11T12:35:32Z"
"revision": "f3200d17e092c607f615320ecaad13d87ad9a2b3",
"revisionTime": "2019-05-22T15:39:15Z"
},
{
"checksumSHA1": "barUU39reQ7LdgYLA323hQ/UGy4=",
"path": "golang.org/x/net/html/charset",
"revision": "3a22650c66bd7f4fb6d1e8072ffd7b75c8a27898",
"revisionTime": "2019-02-11T12:35:32Z"
"revision": "f3200d17e092c607f615320ecaad13d87ad9a2b3",
"revisionTime": "2019-05-22T15:39:15Z"
},
{
"checksumSHA1": "tqqo7DEeFCclb58XbN44WwdpWww=",
"path": "golang.org/x/text/encoding",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "HgcUFTOQF5jOYtTIj5obR3GVN9A=",
"checksumSHA1": "DSdlK4MKI/a3U8Zaee2XKBe01Fo=",
"path": "golang.org/x/text/encoding/charmap",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "UYlVRSWAA5srH3iWvrJz++Zhpr0=",
"checksumSHA1": "SbJkfe5G/5tji96Pa15/ePDOCtk=",
"path": "golang.org/x/text/encoding/htmlindex",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "zeHyHebIZl1tGuwGllIhjfci+wI=",
"path": "golang.org/x/text/encoding/internal",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "oZ2ULELwcY2qgTcpYWrWGsynTuQ=",
"checksumSHA1": "hT7VaIBlkm2YpKulgnjqXNdicGQ=",
"path": "golang.org/x/text/encoding/internal/identifier",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "DhdZROnJq+cEcQ/sHY7GEq5wQ8U=",
"checksumSHA1": "2YqVpmvjWGEBATyUphTP1MS34JE=",
"path": "golang.org/x/text/encoding/japanese",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "qHQ79q9peY8ZkCMC8kJAb52BAWg=",
"checksumSHA1": "+ErWCAdaMwO4PLtrk9D/Hh+7oQM=",
"path": "golang.org/x/text/encoding/korean",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "55UdScb+EMOCPr7OW0hCwDsVxpg=",
"checksumSHA1": "mTuZi5urYwgDIO8+Gfql2pv8Vwg=",
"path": "golang.org/x/text/encoding/simplifiedchinese",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "9EZF1SHTpjVmaT9sARitvGKUXOY=",
"checksumSHA1": "D+VI4j0Wjzr8SeupWdOB5KBdFOw=",
"path": "golang.org/x/text/encoding/traditionalchinese",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "bAJTZJ3IGJdNmN/PSlRMRxWtxec=",
"path": "golang.org/x/text/encoding/unicode",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "8ea1h1pimPfXc6cE5l3SQTe7SVo=",
"checksumSHA1": "ybE4kAPmNPV/dvShuG86AmLbhdE=",
"path": "golang.org/x/text/internal/language",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "GxBlFOqWoIsWCMswUHh6dUqM5no=",
"checksumSHA1": "VDwNSsZP6KShjTSwGUQUGJVrs1I=",
"path": "golang.org/x/text/internal/language/compact",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "hyNCcTwMQnV6/MK8uUW9E5H0J0M=",
"path": "golang.org/x/text/internal/tag",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "Qk7dljcrEK1BJkAEZguxAbG9dSo=",
"path": "golang.org/x/text/internal/utf8internal",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "JsDbO/lVIuCNOJfWQ3XESyrR73Q=",
"checksumSHA1": "oYNlkS+0TimKOScUz3Hn9QWyz6w=",
"path": "golang.org/x/text/language",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "IV4MN7KGBSocu/5NR3le3sxup4Y=",
"path": "golang.org/x/text/runes",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "o3YChxWLvyCmkAn/ZNBj9HC9zKw=",
"checksumSHA1": "R9iBDY+aPnT+8pyRcqGjXq5QixA=",
"path": "golang.org/x/text/transform",
"revision": "6c92c7dc7f53607809182301b96e4cc1975143f1",
"revisionTime": "2018-12-15T14:04:16Z"
"revision": "342b2e1fbaa52c93f31447ad2c6abc048c63e475",
"revisionTime": "2018-12-15T17:52:45Z"
},
{
"checksumSHA1": "4suIsXtE9OeWsKZRT7KsgI5mjfw=",
"checksumSHA1": "2/9hMw7Y4I42L/PTobKqVldWUAU=",
"path": "jaytaylor.com/html2text",
"revision": "57d518f124b0cf46ea2021f25a01396b3522e6fb",
"revisionTime": "2018-06-06T19:48:05Z"
"revision": "01ec452cbe43774f989516272881441cae40c16b",
"revisionTime": "2019-03-26T19:55:09Z"
}
],
"rootPath": "salsa.debian.org/mdosch-guest/feed-to-muc"