/* Copyright Martin Dosch
Licensed under the "MIT License" */
package main

import (
	"encoding/json"
	"hash/fnv"
	"log"
	"os"
	"os/user"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/jaytaylor/html2text"
	"github.com/mmcdole/gofeed"
)
// Get new articles for specified feed.
func getArticles(feedURL string, max int, noExcerpt bool, filter []string, filterMessage []string) (string, error) {
2018-07-13 22:53:22 +02:00
type feedCache struct {
LastChange string
}
var output, cachePath string
2018-07-13 22:53:22 +02:00
var last time.Time
var lastUpdate feedCache
var file *os.File
var updateTime time.Time
// Get systems user cache path.
osCacheDir := os.Getenv("$XDG_CACHE_HOME")
if osCacheDir != "" {
// Create configPath if not yet existing.
cachePath = osCacheDir + "/feed-to-muc/"
if _, err := os.Stat(cachePath); os.IsNotExist(err) {
2023-09-30 20:42:07 +02:00
err = os.MkdirAll(cachePath, 0o700)
if err != nil {
2019-05-31 09:06:36 +02:00
log.Fatal("Error: Can't create cache path:", err)
}
}
} else { // Get the current user.
curUser, err := user.Current()
if err != nil {
2019-05-31 09:06:36 +02:00
log.Fatal("Error: Can't get current user:", err)
return "", err
}
// Get home directory.
home := curUser.HomeDir
if home == "" {
log.Fatal("Error: No home directory available.")
return "", err
}
// Create cachePath if not yet existing.
cachePath = home + "/.cache/feed-to-muc/"
if _, err := os.Stat(cachePath); os.IsNotExist(err) {
2023-09-30 20:42:07 +02:00
err = os.MkdirAll(cachePath, 0o700)
if err != nil {
2019-05-31 09:06:36 +02:00
log.Fatal("Error: Can't create cache path:", err)
}
}
}
2018-07-13 22:53:22 +02:00
// Create a hash as identifier for the feed.
// The identifier will be used as filename for caching the update time.
h := fnv.New32a()
_, err := h.Write([]byte(feedURL))
if err != nil {
2019-08-30 12:39:55 +02:00
log.Fatal("Error: Can't create hash for", feedURL+":", err)
}
2018-07-13 22:53:22 +02:00
if _, err := os.Stat(cachePath); os.IsNotExist(err) {
2023-09-30 20:42:07 +02:00
err = os.MkdirAll(cachePath, 0o700)
2018-07-13 22:53:22 +02:00
if err != nil {
2019-05-31 09:06:36 +02:00
log.Fatal("Error: Can't create hash identifier for cache file:", err)
2018-07-13 22:53:22 +02:00
}
}
cacheFile := cachePath + strconv.Itoa(int(h.Sum32()))
if _, err := os.Stat(cacheFile); os.IsNotExist(err) {
file, err = os.Create(cacheFile)
if err != nil {
2019-05-31 09:06:36 +02:00
log.Fatal("Error: Can't create cache file:", err)
2018-07-13 22:53:22 +02:00
}
last = time.Now()
lastUpdate.LastChange = last.Format(time.RFC3339)
lastUpdateJSON, _ := json.MarshalIndent(lastUpdate, "", " ")
_, err = file.Write(lastUpdateJSON)
if err != nil {
2024-10-05 15:20:57 +02:00
file.Close()
2019-05-31 09:06:36 +02:00
log.Fatal("Error: Can't write last update time stamp to cache file:", err)
2018-07-13 22:53:22 +02:00
}
} else {
2023-09-30 20:42:07 +02:00
file, err = os.OpenFile(cacheFile, os.O_RDWR, 0o600)
2018-07-13 22:53:22 +02:00
if err != nil {
2024-10-05 15:20:57 +02:00
file.Close()
2019-05-31 09:06:36 +02:00
log.Fatal("Error: Can't open cache file:", err)
2018-07-13 22:53:22 +02:00
}
decoder := json.NewDecoder(file)
lastUpdate := feedCache{}
if err := decoder.Decode(&lastUpdate); err != nil {
2024-10-05 15:20:57 +02:00
file.Close()
2019-05-31 09:06:36 +02:00
log.Fatal("Error: Can't decode laste updates time stamp:", err)
2018-07-13 22:53:22 +02:00
}
last, err = time.Parse(time.RFC3339, string(lastUpdate.LastChange))
if err != nil {
2024-10-05 15:20:57 +02:00
file.Close()
2019-05-31 09:06:36 +02:00
log.Fatal("Error: Can't parse last updates time stamp:", err)
2018-07-13 22:53:22 +02:00
}
}
2024-10-05 15:20:57 +02:00
file.Close()
2018-07-13 22:53:22 +02:00
fp := gofeed.NewParser()
feed, err := fp.ParseURL(feedURL)
if err != nil {
// Don't return an error, but log a message as the
// bot should not crash when the feed is not available.
log.Println(feedURL, ": Feed not available.")
return "", nil
2018-07-13 22:53:22 +02:00
}
// If no publish date is offered try update date.
// If both is not offered give up.
if feed.Items[0].PublishedParsed == nil {
if feed.Items[0].UpdatedParsed == nil {
return "", err
}
// If cached timestamp is newer than the one of
// the last article return.
if last.After(*feed.Items[0].UpdatedParsed) {
return "", err
}
2024-10-05 15:20:57 +02:00
} else if last.After(*feed.Items[0].PublishedParsed) {
2018-07-13 22:53:22 +02:00
// If cached timestamp is newer than the one of
// the last article return.
2024-10-05 15:20:57 +02:00
return "", err
2018-07-13 22:53:22 +02:00
}
// Check last n (defined in config) articles for new ones.
for i := max - 1; i >= 0; i-- {
// Stop processing for article i if there are not so
// many articles in the feed.
if len(feed.Items) < i+1 {
continue
}
article := *feed.Items[i]
if err != nil {
return "", err
}
if article.PublishedParsed == nil {
updateTime = *article.UpdatedParsed
} else {
updateTime = *article.PublishedParsed
}
// If cached timestamp is not older than the article stop processing.
// Note: Checking for cached timestamp being newer, instead of not older
// lead to duplicate messages for the same article. Probably a corner
// case when the time is identical.
if !last.Before(updateTime) {
2018-07-13 22:53:22 +02:00
continue
}
last = updateTime
lastUpdate.LastChange = updateTime.Format(time.RFC3339)
2018-07-13 22:53:22 +02:00
// Remove file with cached timestamp and create it
// again with updated timestamp.
// ToDo: Replace timestamp without deleting.
err = os.Remove(cacheFile)
if err != nil {
2019-05-31 09:06:36 +02:00
log.Fatal("Error: Can't delete cache file:", err)
}
2018-07-13 22:53:22 +02:00
file, err = os.Create(cacheFile)
if err != nil {
2019-05-31 09:06:36 +02:00
log.Fatal("Error: Can't create cache file:", err)
}
lastUpdateJSON, _ := json.MarshalIndent(lastUpdate, "", " ")
_, err = file.Write(lastUpdateJSON)
if err != nil {
2024-10-05 15:20:57 +02:00
file.Close()
2019-05-31 09:06:36 +02:00
log.Fatal("Error: Can't write last update time stamp to cache file:", err)
2018-07-13 22:53:22 +02:00
}
2024-10-05 15:20:57 +02:00
file.Close()
2018-07-13 22:53:22 +02:00
// Remove redirects and tracking parameters from URL.
cleanURL, _ := removeTracking(article.Link)
// Only send title and link if option noExcerpt is set,
// otherwise add the description.
if noExcerpt {
// Stop processing the title if it contains the string
// configured in FilterMessage
filterStrike := false
for _, filterString := range filterMessage {
if strings.Contains(feed.Title, filterString) {
filterStrike = true
}
}
if filterStrike {
continue
}
output = output + feed.Title + ": *" + article.Title + "*\n" +
cleanURL
2018-07-13 22:53:22 +02:00
} else {
var description string
// Some feeds don't provide a description, let's use the content
// in this case, otherwise use the shorter description.
if article.Description != "" {
// Strip HTML as we want to get plain text.
description, err = html2text.FromString(article.Description)
if err != nil {
return "", err
}
} else {
// Strip HTML as we want to get plain text.
description, err = html2text.FromString(article.Content)
if err != nil {
return "", err
}
}
// Stop processing the article if it contains the string
// configured in FilterMessage
filterStrike := false
for _, filterString := range filterMessage {
2023-09-30 20:43:55 +02:00
if strings.Contains(description, filterString) ||
strings.Contains(feed.Title, filterString) {
filterStrike = true
}
}
if filterStrike {
continue
}
// Remove lines only consisting of "> "; thank you reddit.
2024-10-05 15:20:57 +02:00
description = strings.ReplaceAll(description, "> \n", "")
2019-05-31 09:50:46 +02:00
// Split article description/content in single lines.
lines := strings.Split(description, "\n")
// Empty article description/content.
description = ""
2019-05-31 09:50:46 +02:00
// Get amount of lines in description/content.
descriptionLength := len(lines)
for i, line := range lines {
// Remove empty lines to safe space.
if line != "" {
2019-05-31 09:50:46 +02:00
// Remove lines starting with one of the defined filters.
filterStrike := false
for _, filterString := range filter {
if strings.HasPrefix(line, filterString) {
2019-05-31 09:50:46 +02:00
filterStrike = true
}
}
if !filterStrike {
2024-10-05 15:20:57 +02:00
description += line
2019-05-31 09:50:46 +02:00
// Add new line, except for the last line.
if i < descriptionLength-1 {
2024-10-05 15:20:57 +02:00
description += "\n"
2019-05-31 09:50:46 +02:00
}
}
}
}
2019-05-31 09:50:46 +02:00
// To make the message look not so bloated we remove double newlines.
// Split the article description/content into fragments between double newlines.
// fragments := strings.Split(description, "\n\n")
// Empty article description/content
// description = ""
// Fill article description/content with the fragments separated by one newline.
// for _, line := range fragments {
// Only if the only content is not empty.
// if line != "" {
// description = description + line + "\n"
// }
//}
// Only append article link if it is not yet contained in description (e.g. read more: URL).
if strings.Contains(description, article.Link) {
// Replace article link with URL cleaned from redirects and trackers.
2024-10-05 15:20:57 +02:00
description = strings.ReplaceAll(description, article.Link, cleanURL)
output = output + feed.Title + ": *" + article.Title + "*\n\n" + description
} else {
output = output + feed.Title + ": *" + article.Title + "*\n\n" + description + "\n" + cleanURL
}
2018-07-13 22:53:22 +02:00
}
if i > 0 {
2024-10-05 15:20:57 +02:00
output += "\n\n---\n\n"
2018-07-13 22:53:22 +02:00
}
}
return output, err
}