feed-to-muc/vendor/github.com/mmcdole/gofeed/internal/shared/xmlbase.go

package shared

import (
	"bytes"
	"fmt"
	"golang.org/x/net/html"
	"net/url"
	"strings"

	"github.com/mmcdole/goxpp"
)

// htmlURIAttrs is the set of (lowercase) HTML attribute names whose values
// are treated as URIs when relative links inside HTML content are resolved.
//
// See https://pythonhosted.org/feedparser/resolving-relative-links.html
//
// The set is not exhaustive; catching every possible URI attribute is
// non-trivial:
// https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
var htmlURIAttrs = map[string]bool{
	"action":     true,
	"background": true,
	"cite":       true,
	"codebase":   true,
	"data":       true,
	"href":       true,
	"poster":     true,
	"profile":    true,
	"scheme":     true,
	"src":        true,
	"uri":        true,
	"usemap":     true,
}

// urlStack is a last-in, first-out stack of URLs. The top of the stack is
// stored at index 0 and older entries follow it.
type urlStack []*url.URL

// push places u on top of the stack.
func (s *urlStack) push(u *url.URL) {
	grown := make([]*url.URL, 0, len(*s)+1)
	grown = append(grown, u)
	grown = append(grown, *s...)
	*s = grown
}

// pop removes the top entry and returns it. It returns nil when the receiver
// is nil or the stack is empty.
func (s *urlStack) pop() *url.URL {
	if s == nil {
		return nil
	}
	items := *s
	if len(items) == 0 {
		return nil
	}
	*s = items[1:]
	return items[0]
}

// top returns the top entry without removing it. It returns nil when the
// receiver is nil or the stack is empty.
func (s *urlStack) top() *url.URL {
	if s != nil && len(*s) > 0 {
		return (*s)[0]
	}
	return nil
}

// XMLBase tracks the xml:base in effect while an XML document is walked with
// NextTag, and resolves relative URIs against it.
type XMLBase struct {
	// stack holds one base URL per currently open element; the innermost
	// element's base is on top.
	stack    urlStack
	// URIAttrs is the set of attribute local names, in lowercase, whose values
	// resolveAttrs rewrites into absolute URLs on every start tag.
	URIAttrs map[string]bool
}

// FindRoot advances the parser with NextTag until the first StartTag event is
// seen and returns that event. Any error reported by NextTag is returned
// unchanged, and an error is returned if EndDocument is observed before a
// start tag.
func (b *XMLBase) FindRoot(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
	for {
		if event, err = b.NextTag(p); err != nil {
			return event, err
		}

		switch event {
		case xpp.StartTag:
			return event, nil
		case xpp.EndDocument:
			return event, fmt.Errorf("Failed to find root node before document end.")
		}
	}
}

// NextTag consumes tokens until it reaches a StartTag or an EndTag and returns
// that event. Along the way it keeps the base-URL stack in step with the
// document: a new base is pushed for every StartTag and the base of an element
// is popped once its EndTag has been left behind, so the top of the stack
// (see CurrentBase and CurrentBaseURL) is always the absolute base URI against
// which relative URIs should be resolved.
//
// On a StartTag the element's URI attributes are also resolved in place.
//
// NextTag is similar to goxpp's NextTag method, except that it does not fail
// when the next immediate token is not a Start/EndTag. Instead it keeps
// consuming tokens until it hits a Start/EndTag, and returns an error if it
// reaches EndDocument first.
func (b *XMLBase) NextTag(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
	for {
		// The parser is still positioned on an end tag from the previous
		// step: that element is now closed, so drop its xml:base.
		if p.Event == xpp.EndTag {
			b.pop()
		}

		if event, err = p.Next(); err != nil {
			return event, err
		}

		switch event {
		case xpp.EndTag:
			return event, nil

		case xpp.StartTag:
			if err = b.push(parseBase(p)); err != nil {
				return event, err
			}
			err = b.resolveAttrs(p)
			return event, err

		case xpp.EndDocument:
			return event, fmt.Errorf("Failed to find NextTag before reaching the end of the document.")
		}
	}
}

// parseBase returns the value of the xml:base attribute on the element the
// parser is currently positioned at, or "" if the element has none.
func parseBase(p *xpp.XMLPullParser) string {
	const xmlNamespace = "http://www.w3.org/XML/1998/namespace"

	for i := range p.Attrs {
		name := p.Attrs[i].Name
		if name.Space != xmlNamespace || name.Local != "base" {
			continue
		}
		return p.Attrs[i].Value
	}
	return ""
}

// push parses base, resolves it against the current base URL (if there is
// one) and makes the result the new current base. It returns the parse error,
// leaving the stack untouched, when base is not a valid URL.
func (b *XMLBase) push(base string) error {
	parsed, err := url.Parse(base)
	if err != nil {
		return err
	}

	if parent := b.CurrentBaseURL(); parent != nil {
		parsed = parent.ResolveReference(parsed)
	}

	b.stack.push(parsed)
	return nil
}

// pop removes the current base URL from the stack and returns it as a string.
// It returns "" when the stack is empty.
func (b *XMLBase) pop() string {
	if popped := b.stack.pop(); popped != nil {
		return popped.String()
	}
	return ""
}

// CurrentBaseURL returns the base URL on top of the stack, or nil when the
// stack is empty. The returned pointer is the stack's own entry, not a copy,
// so modifying it changes the current base.
func (b *XMLBase) CurrentBaseURL() *url.URL {
	return b.stack.top()
}

// CurrentBase returns the current base URL in string form, or "" when no base
// has been pushed.
func (b *XMLBase) CurrentBase() string {
	base := b.CurrentBaseURL()
	if base == nil {
		return ""
	}
	return base.String()
}

// ResolveURL resolves u as a URL reference relative to the current base and
// returns the absolute form.
//
// When there is no current base (or it is empty) u is returned unchanged. If
// u cannot be parsed as a URL, u is returned together with the parse error.
//
// A non-empty base path that does not end in "/" is treated as a directory,
// i.e. a "/" is appended before resolving. That adjustment is applied to a
// private copy of the base: resolving a URL never alters the base stored on
// the stack, so CurrentBase and the bases of nested elements do not depend on
// whether ResolveURL has been called before.
func (b *XMLBase) ResolveURL(u string) (string, error) {
	if b.CurrentBase() == "" {
		return u, nil
	}

	relURL, err := url.Parse(u)
	if err != nil {
		return u, err
	}

	// Copy the URL value so the trailing-slash tweak below stays local.
	base := *b.CurrentBaseURL()
	if u != "" && base.Path != "" && !strings.HasSuffix(base.Path, "/") {
		// There's no reason someone would use a path in xml:base if they
		// didn't mean for it to be a directory
		base.Path += "/"
	}

	return base.ResolveReference(relURL).String(), nil
}

// resolveAttrs rewrites, in place, the value of every attribute of the current
// element whose lowercased local name is listed in b.URIAttrs, replacing it
// with the URL resolved against the current xml:base. It stops at, and
// returns, the first resolution error.
func (b *XMLBase) resolveAttrs(p *xpp.XMLPullParser) error {
	for i := range p.Attrs {
		attr := &p.Attrs[i]
		if !b.URIAttrs[strings.ToLower(attr.Name.Local)] {
			continue
		}

		resolved, err := b.ResolveURL(attr.Value)
		if err != nil {
			return err
		}
		attr.Value = resolved
	}
	return nil
}

// ResolveHTML transforms relHTML by resolving the relative URIs found in its
// attributes against the current base.
//
// Every attribute listed in htmlURIAttrs is resolved on every element, so an
// element carrying several URI attributes (for example both src and poster)
// has all of them rewritten. A value that ResolveURL rejects is left as it is.
//
// When there is no current base the input is returned unchanged. If an error
// occurs during parsing or serialization, the original string is returned
// along with the error.
func (b *XMLBase) ResolveHTML(relHTML string) (string, error) {
	if b.CurrentBase() == "" {
		return relHTML, nil
	}

	doc, err := html.Parse(strings.NewReader(relHTML))
	if err != nil {
		return relHTML, err
	}

	// Recursively traverse the tree, resolving relative URIs in attributes.
	var visit func(*html.Node)
	visit = func(n *html.Node) {
		if n.Type == html.ElementNode {
			for i, a := range n.Attr {
				if !htmlURIAttrs[a.Key] {
					continue
				}
				// Keep scanning after a match: one element may carry more
				// than one URI attribute. Resolution is best effort, so an
				// unresolvable value is simply kept.
				if absVal, err := b.ResolveURL(a.Val); err == nil {
					n.Attr[i].Val = absVal
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			visit(c)
		}
	}
	visit(doc)

	var w bytes.Buffer
	if err := html.Render(&w, doc); err != nil {
		return relHTML, err
	}

	// The rendered output is a complete html5 document, so strip the html
	// and body tags that wrap the original fragment.
	absHTML := strings.TrimPrefix(w.String(), "<html><head></head><body>")
	absHTML = strings.TrimSuffix(absHTML, "</body></html>")

	return absHTML, nil
}
Vendored external dependencies. 2019-02-20 20:23:48 +01:00			`package shared`

			`import (`
			`"bytes"`
			`"fmt"`
			`"golang.org/x/net/html"`
			`"net/url"`
			`"strings"`

			`"github.com/mmcdole/goxpp"`
			`)`

			`var (`
			`// HTML attributes which contain URIs`
			`// https://pythonhosted.org/feedparser/resolving-relative-links.html`
			`// To catch every possible URI attribute is non-trivial:`
			`// https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value`
			`htmlURIAttrs = map[string]bool{`
			`"action": true,`
			`"background": true,`
			`"cite": true,`
			`"codebase": true,`
			`"data": true,`
			`"href": true,`
			`"poster": true,`
			`"profile": true,`
			`"scheme": true,`
			`"src": true,`
			`"uri": true,`
			`"usemap": true,`
			`}`
			`)`

			`type urlStack []*url.URL`

			`func (s urlStack) push(u url.URL) {`
			`s = append([]url.URL{u}, *s...)`
			`}`

			`func (s urlStack) pop() url.URL {`
			`if s == nil \|\| len(*s) == 0 {`
			`return nil`
			`}`
			`var top *url.URL`
			`top, s = (s)[0], (*s)[1:]`
			`return top`
			`}`

			`func (s urlStack) top() url.URL {`
			`if s == nil \|\| len(*s) == 0 {`
			`return nil`
			`}`
			`return (*s)[0]`
			`}`

			`type XMLBase struct {`
			`stack urlStack`
			`URIAttrs map[string]bool`
			`}`

			`// FindRoot iterates through the tokens of an xml document until`
			`// it encounters its first StartTag event. It returns an error`
			`// if it reaches EndDocument before finding a tag.`
			`func (b XMLBase) FindRoot(p xpp.XMLPullParser) (event xpp.XMLEventType, err error) {`
			`for {`
			`event, err = b.NextTag(p)`
			`if err != nil {`
			`return event, err`
			`}`
			`if event == xpp.StartTag {`
			`break`
			`}`

			`if event == xpp.EndDocument {`
			`return event, fmt.Errorf("Failed to find root node before document end.")`
			`}`
			`}`
			`return`
			`}`

			`// XMLBase.NextTag iterates through the tokens until it reaches a StartTag or`
			`// EndTag It maintains the urlStack upon encountering StartTag and EndTags, so`
			`// that the top of the stack (accessible through the CurrentBase() and`
			`// CurrentBaseURL() methods) is the absolute base URI by which relative URIs`
			`// should be resolved.`
			`//`
			`// NextTag is similar to goxpp's NextTag method except it wont throw an error`
			`// if the next immediate token isnt a Start/EndTag. Instead, it will continue`
			`// to consume tokens until it hits a Start/EndTag or EndDocument.`
			`func (b XMLBase) NextTag(p xpp.XMLPullParser) (event xpp.XMLEventType, err error) {`
			`for {`

			`if p.Event == xpp.EndTag {`
			`// Pop xml:base after each end tag`
			`b.pop()`
			`}`

			`event, err = p.Next()`
			`if err != nil {`
			`return event, err`
			`}`

			`if event == xpp.EndTag {`
			`break`
			`}`

			`if event == xpp.StartTag {`
			`base := parseBase(p)`
			`err = b.push(base)`
			`if err != nil {`
			`return`
			`}`

			`err = b.resolveAttrs(p)`
			`if err != nil {`
			`return`
			`}`

			`break`
			`}`

			`if event == xpp.EndDocument {`
			`return event, fmt.Errorf("Failed to find NextTag before reaching the end of the document.")`
			`}`

			`}`
			`return`
			`}`

			`func parseBase(p *xpp.XMLPullParser) string {`
			`xmlURI := "http://www.w3.org/XML/1998/namespace"`
			`for _, attr := range p.Attrs {`
			`if attr.Name.Local == "base" && attr.Name.Space == xmlURI {`
			`return attr.Value`
			`}`
			`}`
			`return ""`
			`}`

			`func (b *XMLBase) push(base string) error {`
			`newURL, err := url.Parse(base)`
			`if err != nil {`
			`return err`
			`}`

			`topURL := b.CurrentBaseURL()`
			`if topURL != nil {`
			`newURL = topURL.ResolveReference(newURL)`
			`}`
			`b.stack.push(newURL)`
			`return nil`
			`}`

			`// returns the popped base URL`
			`func (b *XMLBase) pop() string {`
			`url := b.stack.pop()`
			`if url != nil {`
			`return url.String()`
			`}`
			`return ""`
			`}`

			`func (b XMLBase) CurrentBaseURL() url.URL {`
			`return b.stack.top()`
			`}`

			`func (b *XMLBase) CurrentBase() string {`
			`if url := b.CurrentBaseURL(); url != nil {`
			`return url.String()`
			`}`
			`return ""`
			`}`

			`// resolve the given string as a URL relative to current base`
			`func (b *XMLBase) ResolveURL(u string) (string, error) {`
			`if b.CurrentBase() == "" {`
			`return u, nil`
			`}`

			`relURL, err := url.Parse(u)`
			`if err != nil {`
			`return u, err`
			`}`
			`curr := b.CurrentBaseURL()`
			`if curr.Path != "" && u != "" && curr.Path[len(curr.Path)-1] != '/' {`
			`// There's no reason someone would use a path in xml:base if they`
			`// didn't mean for it to be a directory`
			`curr.Path = curr.Path + "/"`
			`}`
			`absURL := b.CurrentBaseURL().ResolveReference(relURL)`
			`return absURL.String(), nil`
			`}`

			`// resolve relative URI attributes according to xml:base`
			`func (b XMLBase) resolveAttrs(p xpp.XMLPullParser) error {`
			`for i, attr := range p.Attrs {`
			`lowerName := strings.ToLower(attr.Name.Local)`
			`if b.URIAttrs[lowerName] {`
			`absURL, err := b.ResolveURL(attr.Value)`
			`if err != nil {`
			`return err`
			`}`
			`p.Attrs[i].Value = absURL`
			`}`
			`}`
			`return nil`
			`}`

			`// Transforms html by resolving any relative URIs in attributes`
			`// if an error occurs during parsing or serialization, then the original string`
			`// is returned along with the error.`
			`func (b *XMLBase) ResolveHTML(relHTML string) (string, error) {`
			`if b.CurrentBase() == "" {`
			`return relHTML, nil`
			`}`

			`htmlReader := strings.NewReader(relHTML)`

			`doc, err := html.Parse(htmlReader)`
			`if err != nil {`
			`return relHTML, err`
			`}`

			`var visit func(*html.Node)`

			`// recursively traverse HTML resolving any relative URIs in attributes`
			`visit = func(n *html.Node) {`
			`if n.Type == html.ElementNode {`
			`for i, a := range n.Attr {`
			`if htmlURIAttrs[a.Key] {`
			`absVal, err := b.ResolveURL(a.Val)`
			`if err == nil {`
			`n.Attr[i].Val = absVal`
			`}`
			`break`
			`}`
			`}`
			`}`
			`for c := n.FirstChild; c != nil; c = c.NextSibling {`
			`visit(c)`
			`}`
			`}`

			`visit(doc)`
			`var w bytes.Buffer`
			`err = html.Render(&w, doc)`
			`if err != nil {`
			`return relHTML, err`
			`}`

			`// html.Render() always writes a complete html5 document, so strip the html`
			`// and body tags`
			`absHTML := w.String()`
			`absHTML = strings.TrimPrefix(absHTML, "<html><head></head><body>")`
			`absHTML = strings.TrimSuffix(absHTML, "</body></html>")`

			`return absHTML, err`
			`}`