feed-to-muc/getarticles.go
2021-12-13 10:30:00 +01:00

228 lines
6.1 KiB
Go

/* Copyright 2018 Martin Dosch
Licensed under the "MIT License" */
package main
import (
"encoding/json"
"hash/fnv"
"log"
"os"
"os/user"
"strconv"
"strings"
"time"
"github.com/jaytaylor/html2text"
"github.com/mmcdole/gofeed"
)
// getArticles fetches the feed at feedURL and returns the plain-text
// rendering of every item that is newer than the time stamp cached for
// that feed, each followed by its cleaned link.
//
// At most the first max items of the feed are inspected, iterating from the
// highest index down to index 0; the code treats Items[0] as the most recent
// item. For every item that is emitted the cached time stamp is advanced to
// that item's publish (or, if missing, update) time.
//
// The time stamp is stored as JSON in a per-feed file below
// $XDG_CACHE_HOME/feed-to-muc/ or, if XDG_CACHE_HOME is unset, below
// ~/.cache/feed-to-muc/. The file name is the FNV-1a 32 bit hash of feedURL.
// When no cache file exists yet, it is created with the current time, so
// only items published after the first run are reported.
//
// noExcerpt is part of the signature but is not consulted by this function.
//
// Problems with the cache (directory, file, time stamp format) terminate the
// program via log.Fatal. An unavailable or empty feed yields ("", nil) so
// the caller keeps running. A non-nil error is only returned when the HTML
// to text conversion of an item fails.
func getArticles(feedURL string, max int, noExcerpt bool) (string, error) {
	type feedCache struct {
		LastChange string
	}

	// Determine the cache directory. os.Getenv expects the bare variable
	// name; with a leading "$" the lookup can never succeed.
	var cachePath string
	if osCacheDir := os.Getenv("XDG_CACHE_HOME"); osCacheDir != "" {
		cachePath = osCacheDir + "/feed-to-muc/"
	} else {
		curUser, err := user.Current()
		if err != nil {
			log.Fatal("Error: Can't get current user:", err)
		}
		home := curUser.HomeDir
		if home == "" {
			log.Fatal("Error: No home directory available.")
		}
		cachePath = home + "/.cache/feed-to-muc/"
	}
	// MkdirAll is a no-op if the directory already exists.
	if err := os.MkdirAll(cachePath, 0700); err != nil {
		log.Fatal("Error: Can't create cache path:", err)
	}

	// Create a hash as identifier for the feed.
	// The identifier is used as filename for caching the update time.
	h := fnv.New32a()
	if _, err := h.Write([]byte(feedURL)); err != nil {
		log.Fatal("Error: Can't create hash for", feedURL+":", err)
	}
	cacheFile := cachePath + strconv.Itoa(int(h.Sum32()))

	// writeCache replaces the content of the cache file with time stamp t.
	// os.Create truncates an existing file, so no prior removal is needed,
	// and the handle is closed right away instead of being deferred (a
	// defer inside the article loop would pile up open files).
	writeCache := func(t time.Time) {
		data, err := json.MarshalIndent(feedCache{LastChange: t.Format(time.RFC3339)}, "", " ")
		if err != nil {
			log.Fatal("Error: Can't encode last update time stamp:", err)
		}
		file, err := os.Create(cacheFile)
		if err != nil {
			log.Fatal("Error: Can't create cache file:", err)
		}
		if _, err = file.Write(data); err != nil {
			file.Close()
			log.Fatal("Error: Can't write last update time stamp to cache file:", err)
		}
		if err = file.Close(); err != nil {
			log.Fatal("Error: Can't close cache file:", err)
		}
	}

	// Load the cached time stamp, or initialise the cache with "now" on
	// the first run for this feed.
	var last time.Time
	if _, err := os.Stat(cacheFile); os.IsNotExist(err) {
		last = time.Now()
		writeCache(last)
	} else {
		file, err := os.Open(cacheFile)
		if err != nil {
			log.Fatal("Error: Can't open cache file:", err)
		}
		var cached feedCache
		err = json.NewDecoder(file).Decode(&cached)
		// Close before the file is rewritten further down.
		file.Close()
		if err != nil {
			log.Fatal("Error: Can't decode laste updates time stamp:", err)
		}
		last, err = time.Parse(time.RFC3339, cached.LastChange)
		if err != nil {
			log.Fatal("Error: Can't parse last updates time stamp:", err)
		}
	}

	feed, err := gofeed.NewParser().ParseURL(feedURL)
	if err != nil {
		// Don't return an error, but log a message as the
		// bot should not crash when the feed is not available.
		log.Println(feedURL, ": Feed not available.")
		return "", nil
	}
	// A feed without items has nothing to report (and must not be indexed).
	if len(feed.Items) == 0 {
		return "", nil
	}

	// Use the publish date of the first item, falling back to its update
	// date. Give up if neither is offered or if the cached time stamp is
	// newer than that item.
	newest := feed.Items[0].PublishedParsed
	if newest == nil {
		newest = feed.Items[0].UpdatedParsed
	}
	if newest == nil || last.After(*newest) {
		return "", nil
	}

	// Markup fragments removed, in this order, before the HTML is
	// converted to plain text.
	spanFragments := []string{
		`</span><span class="ellipsis">`,
		`</span><span class="invisible">`,
		`</span><span class="">`,
		`</span>`,
		`<span>`,
	}

	// Check the first max items (fewer if the feed is shorter) for new ones,
	// walking from the highest index down to 0.
	start := max - 1
	if start > len(feed.Items)-1 {
		start = len(feed.Items) - 1
	}
	var output strings.Builder
	for i := start; i >= 0; i-- {
		article := feed.Items[i]

		updateTime := article.PublishedParsed
		if updateTime == nil {
			updateTime = article.UpdatedParsed
		}
		// Without any time stamp the item can't be compared; skip it
		// instead of dereferencing a nil pointer.
		if updateTime == nil {
			continue
		}
		// If cached timestamp is not older than the article skip it.
		// Note: Checking for cached timestamp being newer, instead of not
		// older lead to duplicate messages for the same article. Probably
		// a corner case when the time is identical.
		if !last.Before(*updateTime) {
			continue
		}

		// Persist the new time stamp before the item is rendered.
		last = *updateTime
		writeCache(last)

		// Remove redirects and tracking parameters from URL.
		// NOTE(review): the error from removeTracking is discarded, so
		// cleanURL is used as returned even on failure — confirm what it
		// holds in that case.
		cleanURL, _ := removeTracking(article.Link)

		// Strip HTML as we want to get plain text.
		mastodonContent := article.Description
		for _, fragment := range spanFragments {
			mastodonContent = strings.Replace(mastodonContent, fragment, "", -1)
		}
		mastodonContent, err = html2text.FromString(mastodonContent,
			html2text.Options{OmitLinks: true, TextOnly: true})
		if err != nil {
			return "", err
		}

		// If the content is empty after html2text (e.g. consisting of an
		// image only) then skip the item.
		if strings.Replace(mastodonContent, " ", "", -1) == "" {
			continue
		}

		output.WriteString(mastodonContent + "\n\n" + cleanURL)
		if i > 0 {
			output.WriteString("\n\n---\n\n")
		}
	}
	return output.String(), nil
}