Add linkchecker

This commit is contained in:
Kasper Juul Hermansen 2022-02-23 03:22:24 +01:00
parent 0948cefe10
commit 62c62b5c33
Signed by: kjuulh
GPG Key ID: 0F95C140730F2F23
4 changed files with 205 additions and 0 deletions

5
linkchecker/go.mod Normal file
View File

@ -0,0 +1,5 @@
module linkchecker
go 1.17
require golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd

2
linkchecker/go.sum Normal file
View File

@ -0,0 +1,2 @@
golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd h1:O7DYs+zxREGLKzKoMQrtrEacpb0ZVXA5rIwylE2Xchk=
golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=

View File

@ -0,0 +1,184 @@
package linkchecker
import (
"errors"
"golang.org/x/net/html"
"log"
"net/http"
"sync"
)
// Scan states stored in Link.Status.
const (
// NotScanned is the zero value: no fetch result has been recorded for the link.
NotScanned = iota
// Success marks a link whose request returned a status code of 299 or lower.
Success
// Broken marks a link whose request failed or returned a status code above 299.
Broken
)
// Link is one node in the tree of pages visited by a scan.
type Link struct {
// Url is the address that Scan requests with http.Get.
Url string
// links holds the outgoing links parsed from this page after a successful fetch.
links []*Link
// Status is one of NotScanned, Success or Broken.
Status uint64
// StatusCode is the HTTP status of the response; it stays 0 when the
// request itself failed and no response was received.
StatusCode int
// Parent is the page on which this link was found; nil for the root link.
Parent *Link
}
// LinkChecker scans a start page and the pages it links to, and reports the
// broken links it finds.
type LinkChecker struct {
// link is the root of the scanned link tree (the start page).
link *Link
}
// New returns a LinkChecker whose root link points at url. The root starts
// in the NotScanned state, has no parent and no outgoing links yet.
func New(url string) *LinkChecker {
	root := &Link{
		Url:    url,
		links:  []*Link{},
		Status: NotScanned,
	}
	return &LinkChecker{link: root}
}
// Scan crawls from the root link down to the given depth and blocks until
// the wait group shared with Link.Scan reports that all work has finished.
func (c *LinkChecker) Scan(depth uint64) {
	var pending sync.WaitGroup
	c.link.Scan(depth, &pending)
	pending.Wait()
}
// Print logs a report of the last scan: the start URL, the number of links
// scanned and broken, and for each broken link its source page (when it has
// one) and its target with the HTTP status code, or a generic message when
// no status code was recorded.
func (c *LinkChecker) Print() {
	root := c.link
	log.Printf("linkcheck %s", root.Url)

	// Gather the statistics from the link tree.
	total := root.collectScanned()
	var failures []*Link
	root.collectBroken(&failures)
	log.Printf("%d links scanned, %d broken links found", total, len(failures))

	for idx, failed := range failures {
		position := idx + 1
		if failed.Parent == nil {
			log.Printf("%d", position)
		} else {
			log.Printf("%d. Source page: %s", position, failed.Parent.Url)
		}

		// A zero status code means the request never produced a response.
		if failed.StatusCode == 0 {
			log.Printf("Target: %s (status: Unknown error occurred)", failed.Url)
		} else {
			log.Printf("Target: %s (status: %d)", failed.Url, failed.StatusCode)
		}
	}
}
// Scan fetches l.Url, records the outcome on l and, when the page loads
// successfully, scans every outgoing link found on it concurrently with the
// depth reduced by one.
//
// depth is the number of levels still to visit; a depth of zero returns
// immediately and leaves the link in the NotScanned state. wg tracks this
// call and every goroutine it starts, so a caller can wg.Wait() for the
// whole crawl to finish.
//
// A link is marked Broken when the request fails or the response status code
// is above 299; otherwise it is marked Success.
func (l *Link) Scan(depth uint64, wg *sync.WaitGroup) {
	if depth == 0 {
		return
	}
	wg.Add(1)
	defer wg.Done()

	log.Printf("Calling: %s\n", l.Url)
	response, err := http.Get(l.Url)
	if err != nil {
		l.Status = Broken
		return
	}
	defer response.Body.Close()

	l.StatusCode = response.StatusCode
	if response.StatusCode > 299 {
		log.Printf("Status was broken for: %s\n", l.Url)
		l.Status = Broken
		return
	}
	l.Status = Success

	links := getAllOutgoingLinks(response)
	l.links = links
	for _, link := range links {
		link.Parent = l
		// Register the child before its goroutine starts. If the child did
		// the Add itself, this call could finish and bring the counter to
		// zero first, letting wg.Wait() return while children still run.
		wg.Add(1)
		go func(child *Link) {
			defer wg.Done()
			child.Scan(depth-1, wg)
		}(link)
	}
}
// collectScanned counts the links in the tree rooted at l that have a scan
// result. A NotScanned link contributes nothing, a Broken link counts as one
// without looking at its children, and a successful link counts as one plus
// the totals of its children.
func (l *Link) collectScanned() int {
	switch l.Status {
	case NotScanned:
		return 0
	case Broken:
		return 1
	}

	total := 1
	for _, child := range l.links {
		total += child.collectScanned()
	}
	return total
}
// collectBroken appends every Broken link in the tree rooted at l to
// *brokenLinks. NotScanned links are skipped, and the walk does not descend
// below a Broken link.
func (l *Link) collectBroken(brokenLinks *[]*Link) {
	switch l.Status {
	case NotScanned:
		return
	case Broken:
		*brokenLinks = append(*brokenLinks, l)
		return
	}

	for i := range l.links {
		l.links[i].collectBroken(brokenLinks)
	}
}
// traverse walks the HTML tree rooted at n, visiting each node before its
// children, and appends to *links every link that checkLink accepts.
func traverse(n *html.Node, links *[]*Link) {
	found, err := checkLink(n)
	if err == nil {
		*links = append(*links, found)
	}

	child := n.FirstChild
	for child != nil {
		traverse(child, links)
		child = child.NextSibling
	}
}
// checkLink turns an <a> element that carries an href attribute into a new,
// not-yet-scanned Link.
//
// It returns an error when n is not an <a> element, when the element has no
// href, or when the href is local to the current site or document: empty, or
// starting with '/' or '#'. An empty href refers to the current document, so
// treating it as an external URL would only produce a false broken-link
// report. Any other href value is passed through unchanged as the link URL.
func checkLink(n *html.Node) (*Link, error) {
	if n.Type != html.ElementNode || n.Data != "a" {
		return nil, errors.New("was not an element")
	}

	href, ok := getLinkElementWithSource(n)
	if !ok {
		return nil, errors.New("was not an element")
	}
	if href == "" || href[0] == '/' || href[0] == '#' {
		return nil, errors.New("local url, cannot continue")
	}

	return &Link{
		Url:    href,
		links:  make([]*Link, 0),
		Status: NotScanned,
	}, nil
}
// getLinkElementWithSource returns the value of the first href attribute on
// n and true, or an empty string and false when n has no href attribute.
func getLinkElementWithSource(n *html.Node) (string, bool) {
	for i := range n.Attr {
		if n.Attr[i].Key != "href" {
			continue
		}
		return n.Attr[i].Val, true
	}
	return "", false
}
// getAllOutgoingLinks parses the response body as HTML and returns the links
// collected from it by traverse. It returns nil when the body cannot be
// parsed, and an empty non-nil slice when the page has no accepted links.
func getAllOutgoingLinks(response *http.Response) []*Link {
	root, err := html.Parse(response.Body)
	if err != nil {
		return nil
	}

	found := []*Link{}
	traverse(root, &found)
	return found
}

14
linkchecker/main.go Normal file
View File

@ -0,0 +1,14 @@
package main
import "linkchecker/linkchecker"
// main scans a hard-coded start page three levels deep and logs the
// resulting broken-link report.
func main() {
	const (
		startURL  = "https://bitfieldconsulting.com/golang/how"
		scanDepth = 3
	)
	// Alternative start pages kept for manual testing:
	//lc := linkchecker.New("https://example.com")
	//lc := linkchecker.New("https://www.aksjhdfjkhashjkd.com/")
	//lc := linkchecker.New("https://lamehackersguide.blogspot.com/2017/02/weaponizing-postscript.html")

	checker := linkchecker.New(startURL)
	checker.Scan(scanDepth)
	checker.Print()
}