185 lines
3.3 KiB
Go
185 lines
3.3 KiB
Go
package linkchecker
|
|
|
|
import (
	"errors"
	"log"
	"net/http"
	"net/url"
	"sync"

	"golang.org/x/net/html"
)
|
|
|
|
// Scan states stored in Link.Status.
const (
	// NotScanned is the initial state: the link has not been requested yet.
	NotScanned = iota

	// Success marks a link whose request completed with a status code of 299
	// or lower.
	Success

	// Broken marks a link whose request failed or returned a status code
	// above 299.
	Broken
)
|
|
|
|
// Link is one node in the tree of URLs visited by a scan.
type Link struct {
	// Url is the address that is requested when the link is scanned.
	Url string

	// links holds the outgoing links discovered on the page behind Url.
	links []*Link

	// Status is the scan state of this link: NotScanned, Success or Broken.
	Status uint64

	// StatusCode is the HTTP status code of the response. It stays 0 when no
	// response was received.
	StatusCode int

	// Parent is the link on whose page this link was found; nil for the root.
	Parent *Link
}
|
|
|
|
type LinkChecker struct {
|
|
link *Link
|
|
}
|
|
|
|
func New(url string) *LinkChecker {
|
|
return &LinkChecker{
|
|
link: &Link{
|
|
Url: url,
|
|
links: make([]*Link, 0),
|
|
Status: NotScanned,
|
|
Parent: nil,
|
|
},
|
|
}
|
|
}
|
|
|
|
func (c *LinkChecker) Scan(depth uint64) {
|
|
wg := &sync.WaitGroup{}
|
|
|
|
c.link.Scan(depth, wg)
|
|
|
|
wg.Wait()
|
|
}
|
|
|
|
func (c *LinkChecker) Print() {
|
|
log.Printf("linkcheck %s", c.link.Url)
|
|
|
|
// Stats
|
|
scanned := c.link.collectScanned()
|
|
broken := make([]*Link, 0)
|
|
c.link.collectBroken(&broken)
|
|
log.Printf("%d links scanned, %d broken links found", scanned, len(broken))
|
|
|
|
for i, brokenLink := range broken {
|
|
if brokenLink.Parent != nil {
|
|
log.Printf("%d. Source page: %s", i+1, brokenLink.Parent.Url)
|
|
} else {
|
|
log.Printf("%d", i+1)
|
|
}
|
|
if brokenLink.StatusCode != 0 {
|
|
log.Printf("Target: %s (status: %d)", brokenLink.Url, brokenLink.StatusCode)
|
|
} else {
|
|
log.Printf("Target: %s (status: Unknown error occurred)", brokenLink.Url)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (l *Link) Scan(depth uint64, wg *sync.WaitGroup) {
|
|
if depth == 0 {
|
|
return
|
|
}
|
|
wg.Add(1)
|
|
defer wg.Done()
|
|
|
|
log.Printf("Calling: %s\n", l.Url)
|
|
response, err := http.Get(l.Url)
|
|
if err != nil {
|
|
l.Status = Broken
|
|
return
|
|
}
|
|
defer response.Body.Close()
|
|
|
|
l.StatusCode = response.StatusCode
|
|
if response.StatusCode > 299 {
|
|
log.Printf("Status was broken for: %s\n", l.Url)
|
|
l.Status = Broken
|
|
return
|
|
}
|
|
l.Status = Success
|
|
|
|
// log.Printf("Getting outgoing links for: %s\n", l.Url)
|
|
links := getAllOutgoingLinks(response)
|
|
l.links = links
|
|
for _, link := range links {
|
|
link.Parent = l
|
|
go link.Scan(depth-1, wg)
|
|
}
|
|
|
|
}
|
|
|
|
func (l *Link) collectScanned() int {
|
|
if l.Status == NotScanned {
|
|
return 0
|
|
}
|
|
|
|
if l.Status == Broken {
|
|
return 1
|
|
}
|
|
|
|
sumOfChildrenScans := 0
|
|
for _, link := range l.links {
|
|
sumOfChildrenScans += link.collectScanned()
|
|
}
|
|
|
|
return 1 + sumOfChildrenScans
|
|
}
|
|
|
|
func (l *Link) collectBroken(brokenLinks *[]*Link) {
|
|
if l.Status == NotScanned {
|
|
return
|
|
}
|
|
|
|
if l.Status == Broken {
|
|
*brokenLinks = append(*brokenLinks, l)
|
|
return
|
|
}
|
|
|
|
for _, link := range l.links {
|
|
link.collectBroken(brokenLinks)
|
|
}
|
|
}
|
|
|
|
func traverse(n *html.Node, links *[]*Link) {
|
|
if l, err := checkLink(n); err == nil {
|
|
// log.Printf("Found link: %s\n", l.Url)
|
|
*links = append(*links, l)
|
|
}
|
|
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
traverse(c, links)
|
|
}
|
|
}
|
|
|
|
func checkLink(n *html.Node) (*Link, error) {
|
|
if n.Type == html.ElementNode && n.Data == "a" {
|
|
s, ok := getLinkElementWithSource(n)
|
|
if ok {
|
|
if len(s) > 0 && (s[0] == '/' || s[0] == '#') {
|
|
return nil, errors.New("local url, cannot continue")
|
|
}
|
|
return &Link{
|
|
Url: s,
|
|
links: make([]*Link, 0),
|
|
Status: 0,
|
|
}, nil
|
|
}
|
|
}
|
|
|
|
return nil, errors.New("was not an element")
|
|
}
|
|
|
|
func getLinkElementWithSource(n *html.Node) (string, bool) {
|
|
for _, attr := range n.Attr {
|
|
if attr.Key == "href" {
|
|
return attr.Val, true
|
|
}
|
|
}
|
|
return "", false
|
|
}
|
|
|
|
func getAllOutgoingLinks(response *http.Response) []*Link {
|
|
doc, err := html.Parse(response.Body)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
links := make([]*Link, 0)
|
|
traverse(doc, &links)
|
|
|
|
return links
|
|
}
|