Created
April 23, 2025 16:25
-
-
Save Pastor/ce616ee861146985251456ee1429633a to your computer and use it in GitHub Desktop.
goParser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package html | |
import (
	"fmt"
	"net/http"

	"golang.org/x/net/html"
)
type Filter func(tagName string) bool | |
type Extractor func(node *html.Node) Data | |
type Data any | |
type Parser interface { | |
Import(url string) ([]Data, error) | |
} | |
type parser struct { | |
filter Filter | |
extractor Extractor | |
} | |
func (p parser) Import(url string) ([]Data, error) { | |
ret, err := http.Get(url) | |
if err != nil { | |
return nil, err | |
} | |
defer ret.Body.Close() | |
data, err := html.Parse(ret.Body) | |
if err != nil { | |
return nil, err | |
} | |
result := make([]Data, 0) | |
for n := range data.Descendants() { | |
if n.Type == html.ElementNode && p.filter(n.Data) { | |
d := p.extractor(n) | |
if d != nil { | |
result = append(result, d) | |
} | |
} | |
} | |
return result, nil | |
} | |
func NewParser(filter Filter, extractor Extractor) Parser { | |
return &parser{ | |
filter: filter, | |
extractor: extractor, | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package html | |
import ( | |
"fmt" | |
"golang.org/x/net/html" | |
"testing" | |
) | |
func TestParser(t *testing.T) { | |
p := NewParser(func(tagName string) bool { | |
return tagName == "img" | |
}, func(node *html.Node) Data { | |
for i := range node.Attr { | |
attr := node.Attr[i] | |
if attr.Key == "src" { | |
return attr.Val | |
} | |
} | |
return nil | |
}) | |
ret, _ := p.Import("https://skillfactory.ru") | |
for i := range ret { | |
href := ret[i].(string) | |
fmt.Println(href) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment