From 62c62b5c339434a7c2edaa8a4e3012ec1c7aaa52 Mon Sep 17 00:00:00 2001
From: kjuulh
Date: Wed, 23 Feb 2022 03:22:24 +0100
Subject: [PATCH] Add linkchecker

---
 linkchecker/go.mod                     |   5 +
 linkchecker/go.sum                     |   2 +
 linkchecker/linkchecker/linkchecker.go | 231 +++++++++++++++++++++++++
 linkchecker/main.go                    |  14 ++
 4 files changed, 252 insertions(+)
 create mode 100644 linkchecker/go.mod
 create mode 100644 linkchecker/go.sum
 create mode 100644 linkchecker/linkchecker/linkchecker.go
 create mode 100644 linkchecker/main.go

diff --git a/linkchecker/go.mod b/linkchecker/go.mod
new file mode 100644
index 0000000..6bfba6b
--- /dev/null
+++ b/linkchecker/go.mod
@@ -0,0 +1,5 @@
+module linkchecker
+
+go 1.17
+
+require golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd
diff --git a/linkchecker/go.sum b/linkchecker/go.sum
new file mode 100644
index 0000000..aff97b6
--- /dev/null
+++ b/linkchecker/go.sum
@@ -0,0 +1,2 @@
+golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd h1:O7DYs+zxREGLKzKoMQrtrEacpb0ZVXA5rIwylE2Xchk=
+golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
diff --git a/linkchecker/linkchecker/linkchecker.go b/linkchecker/linkchecker/linkchecker.go
new file mode 100644
index 0000000..1ec9244
--- /dev/null
+++ b/linkchecker/linkchecker/linkchecker.go
@@ -0,0 +1,231 @@
+// Package linkchecker crawls a web page and reports the broken links it finds.
+package linkchecker
+
+import (
+	"errors"
+	"log"
+	"net/http"
+	"net/url"
+	"sync"
+	"time"
+
+	"golang.org/x/net/html"
+)
+
+// Scan states stored in Link.Status.
+const (
+	NotScanned = iota
+	Success
+	Broken
+)
+
+// requestTimeout bounds every fetch. http.DefaultClient has no timeout, so a
+// single stalled server would otherwise keep Scan from ever returning.
+const requestTimeout = 15 * time.Second
+
+// httpClient is shared by all scans so connections can be reused.
+var httpClient = &http.Client{Timeout: requestTimeout}
+
+var (
+	errNotLink  = errors.New("was not an element")
+	errLocalURL = errors.New("local url, cannot continue")
+)
+
+// Link is one node of the crawl tree: a URL, the outcome of fetching it and
+// the links found on the fetched page.
+type Link struct {
+	Url        string
+	links      []*Link
+	Status     uint64
+	StatusCode int
+	Parent     *Link
+}
+
+// LinkChecker checks the links reachable from a single root page.
+type LinkChecker struct {
+	link *Link
+}
+
+// New returns a LinkChecker rooted at url. Nothing is fetched until Scan is
+// called.
+func New(url string) *LinkChecker {
+	return &LinkChecker{
+		link: &Link{
+			Url:    url,
+			Status: NotScanned,
+		},
+	}
+}
+
+// Scan fetches the root page and follows the links it finds, depth levels
+// deep (the root page is level one). It returns once every fetch is done.
+func (c *LinkChecker) Scan(depth uint64) {
+	var wg sync.WaitGroup
+
+	c.link.Scan(depth, &wg)
+
+	wg.Wait()
+}
+
+// Print logs the scan totals followed by one entry per broken link.
+func (c *LinkChecker) Print() {
+	log.Printf("linkcheck %s", c.link.Url)
+
+	// Stats
+	scanned := c.link.collectScanned()
+	var broken []*Link
+	c.link.collectBroken(&broken)
+	log.Printf("%d links scanned, %d broken links found", scanned, len(broken))
+
+	for i, brokenLink := range broken {
+		if brokenLink.Parent != nil {
+			log.Printf("%d. Source page: %s", i+1, brokenLink.Parent.Url)
+		} else {
+			log.Printf("%d", i+1)
+		}
+		if brokenLink.StatusCode != 0 {
+			log.Printf("Target: %s (status: %d)", brokenLink.Url, brokenLink.StatusCode)
+		} else {
+			log.Printf("Target: %s (status: Unknown error occurred)", brokenLink.Url)
+		}
+	}
+}
+
+// Scan fetches l, records the outcome on it and scans every link found on
+// the page in its own goroutine with depth reduced by one. A depth of 0 is a
+// no-op.
+//
+// Every goroutine is registered on wg before it starts, so a wg.Wait issued
+// after Scan returns waits for the whole subtree.
+func (l *Link) Scan(depth uint64, wg *sync.WaitGroup) {
+	if depth == 0 {
+		return
+	}
+	wg.Add(1)
+	defer wg.Done()
+
+	log.Printf("Calling: %s\n", l.Url)
+	response, err := httpClient.Get(l.Url)
+	if err != nil {
+		l.Status = Broken
+		return
+	}
+	defer response.Body.Close()
+
+	l.StatusCode = response.StatusCode
+	if response.StatusCode > 299 {
+		log.Printf("Status was broken for: %s\n", l.Url)
+		l.Status = Broken
+		return
+	}
+	l.Status = Success
+
+	l.links = getAllOutgoingLinks(response)
+	for _, link := range l.links {
+		link.Parent = l
+		// Register the child here, not inside its goroutine: an Add that
+		// runs after this function's Done can race with wg.Wait and let
+		// the scan finish while children are still fetching.
+		wg.Add(1)
+		go func(child *Link) {
+			defer wg.Done()
+			child.Scan(depth-1, wg)
+		}(link)
+	}
+}
+
+// collectScanned counts the links in l's subtree that were fetched. Broken
+// links count as one and their children are not visited.
+func (l *Link) collectScanned() int {
+	if l.Status == NotScanned {
+		return 0
+	}
+
+	if l.Status == Broken {
+		return 1
+	}
+
+	sumOfChildrenScans := 0
+	for _, link := range l.links {
+		sumOfChildrenScans += link.collectScanned()
+	}
+
+	return 1 + sumOfChildrenScans
+}
+
+// collectBroken appends every broken link in l's subtree to brokenLinks.
+func (l *Link) collectBroken(brokenLinks *[]*Link) {
+	if l.Status == NotScanned {
+		return
+	}
+
+	if l.Status == Broken {
+		*brokenLinks = append(*brokenLinks, l)
+		return
+	}
+
+	for _, link := range l.links {
+		link.collectBroken(brokenLinks)
+	}
+}
+
+// traverse walks the tree below n depth-first and appends a Link to links
+// for every checkable anchor it finds.
+func traverse(n *html.Node, links *[]*Link) {
+	if l, err := checkLink(n); err == nil {
+		*links = append(*links, l)
+	}
+
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		traverse(c, links)
+	}
+}
+
+// checkLink returns a Link for n when n is an <a> element whose href is an
+// absolute http or https URL. Fragments, relative paths and schemes such as
+// mailto: cannot be fetched on their own, so they are rejected instead of
+// being reported as broken.
+func checkLink(n *html.Node) (*Link, error) {
+	if n.Type != html.ElementNode || n.Data != "a" {
+		return nil, errNotLink
+	}
+	href, ok := getLinkElementWithSource(n)
+	if !ok {
+		return nil, errNotLink
+	}
+
+	target, err := url.Parse(href)
+	if err != nil || target.Host == "" {
+		return nil, errLocalURL
+	}
+	if target.Scheme != "http" && target.Scheme != "https" {
+		return nil, errLocalURL
+	}
+
+	return &Link{Url: href, Status: NotScanned}, nil
+}
+
+// getLinkElementWithSource returns the value of n's href attribute and
+// whether the attribute is present.
+func getLinkElementWithSource(n *html.Node) (string, bool) {
+	for _, attr := range n.Attr {
+		if attr.Key == "href" {
+			return attr.Val, true
+		}
+	}
+	return "", false
+}
+
+// getAllOutgoingLinks parses the response body as HTML and returns the
+// checkable links it contains, or nil when the body cannot be parsed.
+func getAllOutgoingLinks(response *http.Response) []*Link {
+	doc, err := html.Parse(response.Body)
+	if err != nil {
+		return nil
+	}
+
+	var links []*Link
+	traverse(doc, &links)
+
+	return links
+}
diff --git a/linkchecker/main.go b/linkchecker/main.go
new file mode 100644
index 0000000..6ca0a37
--- /dev/null
+++ b/linkchecker/main.go
@@ -0,0 +1,14 @@
+package main
+
+import "linkchecker/linkchecker"
+
+func main() {
+	lc := linkchecker.New("https://bitfieldconsulting.com/golang/how")
+	//lc := linkchecker.New("https://example.com")
+	//lc := linkchecker.New("https://www.aksjhdfjkhashjkd.com/")
+	//lc := linkchecker.New("https://lamehackersguide.blogspot.com/2017/02/weaponizing-postscript.html")
+
+	lc.Scan(3)
+
+	lc.Print()
+}