Add linkchecker
This commit is contained in:
parent
0948cefe10
commit
62c62b5c33
5
linkchecker/go.mod
Normal file
5
linkchecker/go.mod
Normal file
@ -0,0 +1,5 @@
|
||||
module linkchecker
|
||||
|
||||
go 1.17
|
||||
|
||||
require golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd // indirect
|
2
linkchecker/go.sum
Normal file
2
linkchecker/go.sum
Normal file
@ -0,0 +1,2 @@
|
||||
golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd h1:O7DYs+zxREGLKzKoMQrtrEacpb0ZVXA5rIwylE2Xchk=
|
||||
golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
|
184
linkchecker/linkchecker/linkchecker.go
Normal file
184
linkchecker/linkchecker/linkchecker.go
Normal file
@ -0,0 +1,184 @@
|
||||
package linkchecker
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"golang.org/x/net/html"
|
||||
"log"
|
||||
"net/http"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Scan states stored in Link.Status.
const (
	// NotScanned is the zero value: the link has not been fetched.
	NotScanned = iota
	// Success marks a link that was fetched and accepted.
	Success
	// Broken marks a link whose fetch failed or was rejected.
	Broken
)
|
||||
|
||||
// Link is one node in the tree of scanned URLs.
type Link struct {
	// Url is the address this link points to.
	Url string
	// links holds the outgoing links discovered on this page.
	links []*Link
	// Status is the scan state of this link.
	Status uint64
	// StatusCode is the HTTP status code of the response; it stays 0
	// when no response was recorded.
	StatusCode int
	// Parent is the link on whose page this link was found; nil for a
	// link that has no parent.
	Parent *Link
}
|
||||
|
||||
type LinkChecker struct {
|
||||
link *Link
|
||||
}
|
||||
|
||||
func New(url string) *LinkChecker {
|
||||
return &LinkChecker{
|
||||
link: &Link{
|
||||
Url: url,
|
||||
links: make([]*Link, 0),
|
||||
Status: NotScanned,
|
||||
Parent: nil,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (c *LinkChecker) Scan(depth uint64) {
|
||||
wg := &sync.WaitGroup{}
|
||||
|
||||
c.link.Scan(depth, wg)
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func (c *LinkChecker) Print() {
|
||||
log.Printf("linkcheck %s", c.link.Url)
|
||||
|
||||
// Stats
|
||||
scanned := c.link.collectScanned()
|
||||
broken := make([]*Link, 0)
|
||||
c.link.collectBroken(&broken)
|
||||
log.Printf("%d links scanned, %d broken links found", scanned, len(broken))
|
||||
|
||||
for i, brokenLink := range broken {
|
||||
if brokenLink.Parent != nil {
|
||||
log.Printf("%d. Source page: %s", i+1, brokenLink.Parent.Url)
|
||||
} else {
|
||||
log.Printf("%d", i+1)
|
||||
}
|
||||
if brokenLink.StatusCode != 0 {
|
||||
log.Printf("Target: %s (status: %d)", brokenLink.Url, brokenLink.StatusCode)
|
||||
} else {
|
||||
log.Printf("Target: %s (status: Unknown error occurred)", brokenLink.Url)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (l *Link) Scan(depth uint64, wg *sync.WaitGroup) {
|
||||
if depth == 0 {
|
||||
return
|
||||
}
|
||||
wg.Add(1)
|
||||
defer wg.Done()
|
||||
|
||||
log.Printf("Calling: %s\n", l.Url)
|
||||
response, err := http.Get(l.Url)
|
||||
if err != nil {
|
||||
l.Status = Broken
|
||||
return
|
||||
}
|
||||
defer response.Body.Close()
|
||||
|
||||
l.StatusCode = response.StatusCode
|
||||
if response.StatusCode > 299 {
|
||||
log.Printf("Status was broken for: %s\n", l.Url)
|
||||
l.Status = Broken
|
||||
return
|
||||
}
|
||||
l.Status = Success
|
||||
|
||||
// log.Printf("Getting outgoing links for: %s\n", l.Url)
|
||||
links := getAllOutgoingLinks(response)
|
||||
l.links = links
|
||||
for _, link := range links {
|
||||
link.Parent = l
|
||||
go link.Scan(depth-1, wg)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func (l *Link) collectScanned() int {
|
||||
if l.Status == NotScanned {
|
||||
return 0
|
||||
}
|
||||
|
||||
if l.Status == Broken {
|
||||
return 1
|
||||
}
|
||||
|
||||
sumOfChildrenScans := 0
|
||||
for _, link := range l.links {
|
||||
sumOfChildrenScans += link.collectScanned()
|
||||
}
|
||||
|
||||
return 1 + sumOfChildrenScans
|
||||
}
|
||||
|
||||
func (l *Link) collectBroken(brokenLinks *[]*Link) {
|
||||
if l.Status == NotScanned {
|
||||
return
|
||||
}
|
||||
|
||||
if l.Status == Broken {
|
||||
*brokenLinks = append(*brokenLinks, l)
|
||||
return
|
||||
}
|
||||
|
||||
for _, link := range l.links {
|
||||
link.collectBroken(brokenLinks)
|
||||
}
|
||||
}
|
||||
|
||||
func traverse(n *html.Node, links *[]*Link) {
|
||||
if l, err := checkLink(n); err == nil {
|
||||
// log.Printf("Found link: %s\n", l.Url)
|
||||
*links = append(*links, l)
|
||||
}
|
||||
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
traverse(c, links)
|
||||
}
|
||||
}
|
||||
|
||||
func checkLink(n *html.Node) (*Link, error) {
|
||||
if n.Type == html.ElementNode && n.Data == "a" {
|
||||
s, ok := getLinkElementWithSource(n)
|
||||
if ok {
|
||||
if len(s) > 0 && (s[0] == '/' || s[0] == '#') {
|
||||
return nil, errors.New("local url, cannot continue")
|
||||
}
|
||||
return &Link{
|
||||
Url: s,
|
||||
links: make([]*Link, 0),
|
||||
Status: 0,
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, errors.New("was not an element")
|
||||
}
|
||||
|
||||
func getLinkElementWithSource(n *html.Node) (string, bool) {
|
||||
for _, attr := range n.Attr {
|
||||
if attr.Key == "href" {
|
||||
return attr.Val, true
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
||||
func getAllOutgoingLinks(response *http.Response) []*Link {
|
||||
doc, err := html.Parse(response.Body)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
links := make([]*Link, 0)
|
||||
traverse(doc, &links)
|
||||
|
||||
return links
|
||||
}
|
14
linkchecker/main.go
Normal file
14
linkchecker/main.go
Normal file
@ -0,0 +1,14 @@
|
||||
package main
|
||||
|
||||
import "linkchecker/linkchecker"
|
||||
|
||||
func main() {
|
||||
lc := linkchecker.New("https://bitfieldconsulting.com/golang/how")
|
||||
//lc := linkchecker.New("https://example.com")
|
||||
//lc := linkchecker.New("https://www.aksjhdfjkhashjkd.com/")
|
||||
//lc := linkchecker.New("https://lamehackersguide.blogspot.com/2017/02/weaponizing-postscript.html")
|
||||
|
||||
lc.Scan(3)
|
||||
|
||||
lc.Print()
|
||||
}
|
Loading…
Reference in New Issue
Block a user