blog: Watch website changes with RSS

I use the Go program below to get notifications in my RSS reader when websites that don’t offer RSS feeds themselves change. For each website, you create a new command in main(), choose a shortname, enter the URL, and enter an HTML node selector for the part you are interested in (this also excludes surrounding content that might be generated dynamically on each visit). You then have your RSS reader call this program with “go run webwatcher SHORTNAME”.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	. "git.fireandbrimst.one/aw/goutil/html"
	"git.fireandbrimst.one/aw/goutil/misc"
	xnetHtml "golang.org/x/net/html"
	"io/ioutil"
	"os"
	"path"
	"text/template"
	"time"
)

const (
	// DL_LIMIT is the limit handed to misc.DownloadAll for every fetch.
	// NOTE(review): presumably a maximum download size in bytes (15 MiB) —
	// confirm against goutil/misc.
	DL_LIMIT     = 15 * 1024 * 1024
	// CACHE_FOLDER is the directory that holds one JSON state file per
	// shortname. It is a relative path, so it is resolved against the
	// working directory of the process.
	CACHE_FOLDER = "cache"
)

// RSS_TEMPLATE is the text/template source of the generated feed: one channel
// containing exactly one item. It is executed with an HP value and reads the
// fields Shortname, URL, LastContent and LastModified. The item's guid embeds
// the LastModified timestamp, and pubDate is that same timestamp.
//
// NOTE(review): text/template inserts values verbatim, so a value containing
// "]]>" would close its CDATA section early and likely produce malformed XML.
const RSS_TEMPLATE string = `<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title><![CDATA[ {{.Shortname}} ]]></title>
<link><![CDATA[ {{.URL}} ]]></link>
<description><![CDATA[ {{.Shortname}} ]]></description>

    <item>
      <title><![CDATA[ {{.URL}} ]]></title>
      <content:encoded><![CDATA[ {{.LastContent}} ]]></content:encoded>
      <guid><![CDATA[ {{.URL}}/{{.LastModified.Format "20060102-150405"}} ]]></guid>
      <link><![CDATA[ {{.URL}} ]]></link>
      <pubDate>{{.LastModified.Format "Mon, 02 Jan 2006 15:04:05 -0700"}}</pubDate>
    </item>

</channel>
</rss>
`

// optPanic is the program's uniform error handler: a nil error is a no-op,
// any other error aborts the run by panicking with that error as the value.
func optPanic(err error) {
	if err == nil {
		return
	}
	panic(err)
}

// command describes one watched website.
type command struct {
	// shortname is compared against the first command-line argument and is
	// also the name of the cache file inside CACHE_FOLDER.
	shortname string
	// URL is the page that gets downloaded; it is also emitted in the feed.
	URL       string
	// selector is handed to HtmlNode.Find to pick the node whose rendered
	// markup is watched for changes.
	selector  func(n *HtmlNode) bool
}

// filename returns where this command's state is cached: a file named after
// the shortname, located in CACHE_FOLDER.
func (c *command) filename() string {
	name := c.shortname
	return path.Join(CACHE_FOLDER, name)
}

// getContent downloads c.URL (bounded by DL_LIMIT), parses the response as
// HTML and returns the rendered markup of the node found by c.selector.
//
// It panics via optPanic when the download, the parse or the rendering fails,
// or when no node was found.
func (c *command) getContent() string {
	b, err := misc.DownloadAll(c.URL, DL_LIMIT)
	optPanic(err)
	tmpdoc, err := xnetHtml.Parse(bytes.NewReader(b))
	optPanic(err)
	doc := (*HtmlNode)(tmpdoc)
	n := doc.Find(c.selector)
	if n == nil {
		// NOTE(review): presumably Find yields nil when nothing matches —
		// confirm against goutil/html. Rendering a nil node would crash with
		// an opaque nil dereference, so fail with a message naming the site.
		optPanic(fmt.Errorf("no node matched the selector for %q (%s)", c.shortname, c.URL))
	}
	var buf bytes.Buffer
	// Render can reject node trees it cannot serialize; do not drop that error.
	optPanic(xnetHtml.Render(&buf, (*xnetHtml.Node)(n)))
	return buf.String()
}

// unmarshalHPObject loads the cached state stored as JSON in filename.
// A file that cannot be read or decoded is treated as "no state yet": the
// zero HP value is returned instead of an error.
func unmarshalHPObject(filename string) HP {
	data, err := ioutil.ReadFile(filename)
	if err != nil {
		return HP{}
	}
	var cached HP
	if err := json.Unmarshal(data, &cached); err != nil {
		// Discard whatever a failed decode may have filled in.
		return HP{}
	}
	return cached
}

// marshalHPObject stores hp as indented JSON in filename (mode 0644),
// replacing any previous contents. Encoding or write failures panic via
// optPanic.
func marshalHPObject(filename string, hp HP) {
	encoded, err := json.MarshalIndent(hp, "", "  ")
	optPanic(err)
	optPanic(ioutil.WriteFile(filename, encoded, 0644))
}

// genRSS fetches the watched content, compares it with the cached state,
// writes the resulting feed to stdout and finally persists the state.
// LastContent and LastModified only move when the fetched content differs
// from the cached one, so an unchanged page keeps its previous timestamp.
func (c *command) genRSS() {
	fresh := c.getContent()

	state := unmarshalHPObject(c.filename())
	state.Shortname, state.URL = c.shortname, c.URL
	if state.LastContent != fresh {
		state.LastContent, state.LastModified = fresh, time.Now()
	}

	optPanic(rssTemplate.Execute(os.Stdout, state))
	marshalHPObject(c.filename(), state)
}

// HP is the state kept per watched page. It is stored as JSON in the cache
// folder and is also the value the RSS template is executed with.
type HP struct {
	// Shortname and URL are overwritten from the command on every run.
	Shortname    string
	URL          string
	// LastContent is the rendered markup of the selected node as of the
	// most recently detected change.
	LastContent  string
	// LastModified is the local time at which that change was detected.
	LastModified time.Time
}

// rssTemplate is the parsed form of RSS_TEMPLATE. It is assigned in main
// before any command runs and is executed by genRSS.
var rssTemplate *template.Template

// main looks up the command whose shortname equals the first command-line
// argument, writes that site's RSS feed to stdout and exits with status 0.
// If the shortname is missing or unknown, it reports the problem on stderr
// and exits with a non-zero status.
func main() {
	// Without this check a bare invocation dies with an index-out-of-range
	// panic instead of a usable message.
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: webwatcher SHORTNAME")
		os.Exit(-1)
	}
	name := os.Args[1]

	rssTemplate = template.Must(template.New("rss").Parse(RSS_TEMPLATE))
	// An already existing cache folder is the normal case on every run after
	// the first; any other failure means the state could not be saved later.
	if err := os.Mkdir(CACHE_FOLDER, 0755); err != nil && !os.IsExist(err) {
		optPanic(err)
	}

	commands := []command{
		{shortname: "stilldrinking", URL: "https://www.stilldrinking.org/", selector: IsTag("div").And(HasID("cont"))},
	}

	for i := range commands {
		if cmd := &commands[i]; cmd.shortname == name {
			cmd.genRSS()
			os.Exit(0)
		}
	}
	fmt.Fprintln(os.Stderr, "unknown command", name)
	os.Exit(-1)
}
Posted in programming
2020-11-27 14:25 UTC