feed-to-muc/vendor/github.com/mmcdole/gofeed/internal/shared/parseutils.go

164 lines
3.8 KiB
Go
Raw Normal View History

2019-02-20 20:23:48 +01:00
package shared
import (
"bytes"
"errors"
2020-10-24 12:37:32 +02:00
"html"
2019-02-20 20:23:48 +01:00
"regexp"
"strings"
2020-05-14 16:07:09 +02:00
xpp "github.com/mmcdole/goxpp"
2019-02-20 20:23:48 +01:00
)
// Patterns used by ParseNameAddress, tried in the order listed here.
var (
	// emailNameRgx matches "address@host (Display Name)".
	emailNameRgx = regexp.MustCompile(`^([^@]+@[^\s]+)\s+\(([^@]+)\)$`)
	// nameEmailRgx matches "Display Name (address@host)".
	nameEmailRgx = regexp.MustCompile(`^([^@]+)\s+\(([^@]+@[^)]+)\)$`)
	// nameOnlyRgx matches text without any '@' or parentheses.
	nameOnlyRgx = regexp.MustCompile(`^([^@()]+)$`)
	// emailOnlyRgx matches a lone "address@host" without parentheses.
	emailOnlyRgx = regexp.MustCompile(`^([^@()]+@[^@()]+)$`)
)

// Exported sentinel errors for entity decoding.
// NOTE(review): nothing in the visible part of this file returns them;
// they are presumably kept for callers that still compare against them.
var (
	// TruncatedEntity reports an entity that ends prematurely.
	TruncatedEntity = errors.New("truncated entity")
	// InvalidNumericReference reports a malformed numeric character reference.
	InvalidNumericReference = errors.New("invalid numeric reference")
)
2020-05-14 16:07:09 +02:00
// Delimiters of an XML CDATA section. The names keep their original
// spelling because they are exported.
const (
	// CDATA_START is the marker that opens a CDATA section.
	CDATA_START = "<![CDATA["
	// CDATA_END is the marker that closes a CDATA section.
	CDATA_END = "]]>"
)
2019-05-31 09:10:48 +02:00
2019-02-20 20:23:48 +01:00
// ParseText is a helper function for parsing the text
// from the current element of the XMLPullParser.
// This function can handle parsing naked XML text from
// an element.
func ParseText(p *xpp.XMLPullParser) (string, error) {
var text struct {
Type string `xml:"type,attr"`
InnerXML string `xml:",innerxml"`
}
err := p.DecodeElement(&text)
if err != nil {
return "", err
}
result := text.InnerXML
result = strings.TrimSpace(result)
2020-05-14 16:07:09 +02:00
if strings.Contains(result, CDATA_START) {
return StripCDATA(result), nil
2019-02-20 20:23:48 +01:00
}
return DecodeEntities(result)
}
2020-05-14 16:07:09 +02:00
// StripCDATA removes CDATA tags from the string
// content outside of CDATA tags is passed via DecodeEntities
func StripCDATA(str string) string {
buf := bytes.NewBuffer([]byte{})
curr := 0
for curr < len(str) {
start := indexAt(str, CDATA_START, curr)
if start == -1 {
dec, _ := DecodeEntities(str[curr:])
buf.Write([]byte(dec))
return buf.String()
}
end := indexAt(str, CDATA_END, start)
if end == -1 {
dec, _ := DecodeEntities(str[curr:])
buf.Write([]byte(dec))
return buf.String()
}
buf.Write([]byte(str[start+len(CDATA_START) : end]))
curr = curr + end + len(CDATA_END)
}
return buf.String()
}
2019-02-20 20:23:48 +01:00
// DecodeEntities decodes escaped XML/HTML entities in str and returns the
// unescaped string.
//
// A candidate entity is the text from an '&' up to and including the next
// ';'. Candidates are decoded with html.UnescapeString, which leaves
// unknown entities untouched. An '&' is treated as a literal ampersand when
// no ';' follows it, or when a space appears between it and the next ';'.
// In the latter case scanning continues after the ampersand, so genuine
// entities later in the string are still decoded.
//
// The returned error is always nil; it is kept for interface compatibility.
func DecodeEntities(str string) (string, error) {
	var buf bytes.Buffer

	rest := str
	for len(rest) > 0 {
		// Copy everything up to the next ampersand unchanged.
		amp := strings.IndexByte(rest, '&')
		if amp == -1 {
			buf.WriteString(rest)
			break
		}
		buf.WriteString(rest[:amp])
		rest = rest[amp:]

		// Without a terminator nothing in the remainder can be an entity.
		// This also covers a lone trailing '&'.
		semi := strings.IndexByte(rest, ';')
		if semi == -1 {
			buf.WriteString(rest)
			break
		}

		// A space before the terminator means this '&' is a plain
		// ampersand. Emit just that byte and keep scanning so that later
		// entities are not skipped.
		if strings.Contains(rest[1:semi], " ") {
			buf.WriteByte('&')
			rest = rest[1:]
			continue
		}

		buf.WriteString(html.UnescapeString(rest[:semi+1]))
		rest = rest[semi+1:]
	}

	return buf.String(), nil
}
// ParseNameAddress splits a combined name/email string, as commonly found
// in RSS feeds, into its name and address parts.
//
// The input is matched against four patterns in order, and the first match
// wins:
//
//   - "example@site.com (Example Name)" yields both address and name
//   - "Example Name (example@site.com)" yields both name and address
//   - text without '@' or parentheses yields the name only
//   - a bare "example@site.com" yields the address only
//
// An empty input, or one that matches none of the patterns, yields two
// empty strings.
func ParseNameAddress(nameAddressText string) (name string, address string) {
	if nameAddressText == "" {
		return "", ""
	}

	if m := emailNameRgx.FindStringSubmatch(nameAddressText); m != nil {
		return m[2], m[1]
	}
	if m := nameEmailRgx.FindStringSubmatch(nameAddressText); m != nil {
		return m[1], m[2]
	}
	if m := nameOnlyRgx.FindStringSubmatch(nameAddressText); m != nil {
		return m[1], ""
	}
	if m := emailOnlyRgx.FindStringSubmatch(nameAddressText); m != nil {
		return "", m[1]
	}
	return "", ""
}
2020-05-14 16:07:09 +02:00
// indexAt returns the absolute index of the first occurrence of substr in
// str at or after offset start, or -1 if there is none. The caller must
// ensure that start is a valid slice offset into str.
func indexAt(str, substr string, start int) int {
	if rel := strings.Index(str[start:], substr); rel >= 0 {
		return start + rel
	}
	return -1
}