mirror of https://github.com/pikami/rss-dl.git
Added html parser
This commit is contained in:
parent
d3cf54472a
commit
01f7879f7e
|
@ -9,10 +9,10 @@ jobs:
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Set up Go 1.13
|
- name: Set up Go 1.16
|
||||||
uses: actions/setup-go@v1
|
uses: actions/setup-go@v1
|
||||||
with:
|
with:
|
||||||
go-version: 1.13
|
go-version: 1.16
|
||||||
id: go
|
id: go
|
||||||
|
|
||||||
- name: Check out code into the Go module directory
|
- name: Check out code into the Go module directory
|
||||||
|
|
|
@ -11,6 +11,7 @@ You can download feeds by running `./rss-dl [Options] FEED_URL`
|
||||||
|
|
||||||
## Available options
|
## Available options
|
||||||
* `-output some_directory` - Output path (default ".")
|
* `-output some_directory` - Output path (default ".")
|
||||||
|
* `-parsehtml` - Save content as html
|
||||||
|
|
||||||
## Acknowledgments
|
## Acknowledgments
|
||||||
This software uses the gofeed parser which can be found here: https://github.com/mmcdole/gofeed
|
This software uses the gofeed parser which can be found here: https://github.com/mmcdole/gofeed
|
||||||
|
|
6
go.mod
6
go.mod
|
@ -2,4 +2,8 @@ module github.com/pikami/rss-dl
|
||||||
|
|
||||||
go 1.16
|
go 1.16
|
||||||
|
|
||||||
require github.com/mmcdole/gofeed v1.1.3
|
require (
|
||||||
|
github.com/PuerkitoBio/goquery v1.6.1 // indirect
|
||||||
|
github.com/mmcdole/gofeed v1.1.3
|
||||||
|
golang.org/x/net v0.0.0-20200301022130-244492dfa37a // indirect
|
||||||
|
)
|
||||||
|
|
2
go.sum
2
go.sum
|
@ -1,6 +1,8 @@
|
||||||
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
|
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
|
||||||
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
|
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
|
||||||
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
|
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
|
||||||
|
github.com/PuerkitoBio/goquery v1.6.1 h1:FgjbQZKl5HTmcn4sKBgvx8vv63nhyhIpv7lJpFGCWpk=
|
||||||
|
github.com/PuerkitoBio/goquery v1.6.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
|
||||||
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
|
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
|
||||||
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||||
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
|
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
|
||||||
|
|
|
@ -0,0 +1,52 @@
|
||||||
|
// Package htmlparse downloads the assets referenced by a feed item's
// HTML content and rewrites the HTML so it points at the local copies.
//
// NOTE(review): the package is named htmlparse but lives in (and is
// imported as) the htmlgrab directory — main.go aliases it. Consider
// renaming the package to match the directory per Go convention.
package htmlparse

import (
	"fmt"
	"path/filepath"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"

	fileio "github.com/pikami/rss-dl/fileio"
	helpers "github.com/pikami/rss-dl/helpers"
)

// HtmlGrab parses htmlStr, downloads every <img> asset into
// itemOutputDir/html, rewrites each img src attribute to the local
// file name, and writes the rewritten document to
// itemOutputDir/html/index.html. Parse or serialization failures are
// reported on stdout and abort the grab.
func HtmlGrab(htmlStr string, itemOutputDir string) {
	rootNode, err := html.Parse(strings.NewReader(htmlStr))
	if err != nil {
		// Report instead of silently dropping this item's content.
		fmt.Printf("[htmlgrab] failed to parse html: %v\n", err)
		return
	}

	// Init download dir.
	outputDir := filepath.Join(itemOutputDir, "html")
	fileio.InitOutputDirectory(outputDir)

	// Load the HTML document.
	doc := goquery.NewDocumentFromNode(rootNode)

	// Download image assets and point the document at the local copies.
	doc.Find("img").Each(func(i int, s *goquery.Selection) {
		val, exists := s.Attr("src")
		if !exists {
			return
		}

		// Strip GET parameters so the local file name is stable.
		imageName := helpers.RemoveGetParams(filepath.Base(val))
		itemImagePath := filepath.Join(outputDir, imageName)

		helpers.LogInfo("Downloading image to " + itemImagePath)
		fileio.DownloadFile(itemImagePath, val)

		fmt.Printf("[htmlgrab] %d: %s\n", i, val)

		// Rewrite the src so the saved document is self-contained.
		s.SetAttr("src", imageName)
	})

	newHtml, err := doc.Html()
	if err != nil {
		// Report instead of silently discarding the rewritten document.
		fmt.Printf("[htmlgrab] failed to render html: %v\n", err)
		return
	}

	fileio.WriteToFile(filepath.Join(outputDir, "index.html"), newHtml)
}
|
11
main.go
11
main.go
|
@ -9,6 +9,7 @@ import (
|
||||||
|
|
||||||
fileio "github.com/pikami/rss-dl/fileio"
|
fileio "github.com/pikami/rss-dl/fileio"
|
||||||
helpers "github.com/pikami/rss-dl/helpers"
|
helpers "github.com/pikami/rss-dl/helpers"
|
||||||
|
htmlgrab "github.com/pikami/rss-dl/htmlgrab"
|
||||||
structs "github.com/pikami/rss-dl/structs"
|
structs "github.com/pikami/rss-dl/structs"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -26,8 +27,10 @@ func main() {
|
||||||
helpers.LogInfo("Writing feed details as JSON to " + feedInfoPath)
|
helpers.LogInfo("Writing feed details as JSON to " + feedInfoPath)
|
||||||
fileio.WriteToFile(feedInfoPath, GrabFeedDetailsJSON(feed))
|
fileio.WriteToFile(feedInfoPath, GrabFeedDetailsJSON(feed))
|
||||||
|
|
||||||
feedImagePath := outputDir + "/image" + helpers.RemoveGetParams(filepath.Ext(feed.Image.URL))
|
if feed.Image != nil {
|
||||||
fileio.DownloadFile(feedImagePath, feed.Image.URL)
|
feedImagePath := outputDir + "/image" + helpers.RemoveGetParams(filepath.Ext(feed.Image.URL))
|
||||||
|
fileio.DownloadFile(feedImagePath, feed.Image.URL)
|
||||||
|
}
|
||||||
|
|
||||||
for _, item := range feed.Items {
|
for _, item := range feed.Items {
|
||||||
itemOutputFilename := helpers.ToCleanString(
|
itemOutputFilename := helpers.ToCleanString(
|
||||||
|
@ -63,6 +66,10 @@ func main() {
|
||||||
itemOutputDir+"/"+filename,
|
itemOutputDir+"/"+filename,
|
||||||
enclosure.URL)
|
enclosure.URL)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if structs.Config.ParseHtml {
|
||||||
|
htmlgrab.HtmlGrab(item.Content, itemOutputDir)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,11 +10,13 @@ import (
|
||||||
var Config struct {
|
var Config struct {
|
||||||
FeedURL string
|
FeedURL string
|
||||||
OutputPath string
|
OutputPath string
|
||||||
|
ParseHtml bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetConfig - Returns Config object
|
// GetConfig - Returns Config object
|
||||||
func GetConfig() {
|
func GetConfig() {
|
||||||
outputPath := flag.String("output", ".", "Output path")
|
outputPath := flag.String("output", ".", "Output path")
|
||||||
|
parseHtml := flag.Bool("parsehtml", false, "Save content as html")
|
||||||
|
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
|
|
||||||
|
@ -26,4 +28,5 @@ func GetConfig() {
|
||||||
|
|
||||||
Config.FeedURL = flag.Args()[len(args)-1]
|
Config.FeedURL = flag.Args()[len(args)-1]
|
||||||
Config.OutputPath = *outputPath
|
Config.OutputPath = *outputPath
|
||||||
|
Config.ParseHtml = *parseHtml
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue