Last active
August 29, 2023 11:38
-
-
Save inotnako/c4a82f6723f6ccea5d83c5d3689373dd to your computer and use it in GitHub Desktop.
Get metadata from an HTML page
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/json" | |
"net/http" | |
"net/url" | |
"golang.org/x/net/html" | |
"io" | |
) | |
func main() { | |
http.HandleFunc(`/read`, func(rw http.ResponseWriter, req *http.Request) { | |
rw.Header().Set(`Content-Type`, `application/json`) | |
err := req.ParseForm() | |
if err != nil { | |
rw.WriteHeader(http.StatusBadRequest) | |
json.NewEncoder(rw).Encode(map[string]string{"error": err.Error()}) | |
return | |
} | |
link := req.FormValue(`link`) | |
if link == "" { | |
rw.WriteHeader(http.StatusBadRequest) | |
json.NewEncoder(rw).Encode(map[string]string{"error": `empty value of link`}) | |
return | |
} | |
if _, err := url.Parse(link); err != nil { | |
rw.WriteHeader(http.StatusBadRequest) | |
json.NewEncoder(rw).Encode(map[string]string{"error": err.Error()}) | |
return | |
} | |
resp, err := http.Get(link) | |
if err != nil { | |
//proxy status and err | |
rw.WriteHeader(resp.StatusCode) | |
json.NewEncoder(rw).Encode(map[string]string{"error": err.Error()}) | |
return | |
} | |
defer resp.Body.Close() | |
meta := extract(resp.Body) | |
rw.WriteHeader(http.StatusOK) | |
json.NewEncoder(rw).Encode(meta) | |
return | |
}) | |
// little help %) | |
println("call like: \n$ curl -XPOST 'http://localhost:4567/read' -d link='https://github.com/golang/go'") | |
println(`{"title":"golang/go","description":"go - The Go programming language","image":"https://avatars1.githubusercontent.com/u/4314092?v=3\u0026s=400","site_name":"GitHub"}`) | |
err := http.ListenAndServe(`:4567`, nil) | |
if err != nil { | |
panic(err) | |
} | |
} | |
// HTMLMeta is the set of page metadata that is serialized to JSON.
type HTMLMeta struct {
	// Title is encoded under the "title" key.
	Title string `json:"title"`
	// Description is encoded under the "description" key.
	Description string `json:"description"`
	// Image is encoded under the "image" key.
	Image string `json:"image"`
	// SiteName is encoded under the "site_name" key.
	SiteName string `json:"site_name"`
}
func extract(resp io.Reader) *HTMLMeta { | |
z := html.NewTokenizer(resp) | |
titleFound := false | |
hm := new(HTMLMeta) | |
for { | |
tt := z.Next() | |
switch tt { | |
case html.ErrorToken: | |
return hm | |
case html.StartTagToken, html.SelfClosingTagToken: | |
t := z.Token() | |
if t.Data == `body` { | |
return hm | |
} | |
if t.Data == "title" { | |
titleFound = true | |
} | |
if t.Data == "meta" { | |
desc, ok := extractMetaProperty(t, "description") | |
if ok { | |
hm.Description = desc | |
} | |
ogTitle, ok := extractMetaProperty(t, "og:title") | |
if ok { | |
hm.Title = ogTitle | |
} | |
ogDesc, ok := extractMetaProperty(t, "og:description") | |
if ok { | |
hm.Description = ogDesc | |
} | |
ogImage, ok := extractMetaProperty(t, "og:image") | |
if ok { | |
hm.Image = ogImage | |
} | |
ogSiteName, ok := extractMetaProperty(t, "og:site_name") | |
if ok { | |
hm.SiteName = ogSiteName | |
} | |
} | |
case html.TextToken: | |
if titleFound { | |
t := z.Token() | |
hm.Title = t.Data | |
titleFound = false | |
} | |
} | |
} | |
return hm | |
} | |
func extractMetaProperty(t html.Token, prop string) (content string, ok bool) { | |
for _, attr := range t.Attr { | |
if attr.Key == "property" && attr.Val == prop { | |
ok = true | |
} | |
if attr.Key == "content" { | |
content = attr.Val | |
} | |
} | |
return | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Very clean and impressive!