Created
May 13, 2025 23:29
-
-
Save SR-G/c7f055f9341606d8621fd4cf52eb967a to your computer and use it in GitHub Desktop.
Go code to remove duplicated lines from YouTube auto-generated VTT transcripts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bytes" | |
"fmt" | |
"strings" | |
"github.com/asticode/go-astisub" | |
) | |
// main runs GetCleanTranscriptContent on an embedded WebVTT sample
// (Kind: captions, Language: fr) and prints the result to standard output.
//
// The sample interleaves cues whose text carries inline <timestamp><c>…</c>
// markup with cues that repeat the same text without markup, so consecutive
// cues contain the same sentence several times.
//
// NOTE(review): the trailing ` | |` on every line of this block looks like a
// copy/extraction artifact rather than intended program text — confirm against
// the original file before building.
func main() { | |
input := `WEBVTT | |
Kind: captions | |
Language: fr | |
00:00:59.150 --> 00:01:00.790 align:start position:0% | |
forme déshydratée et dans ce cas vous | |
penserez<00:00:59.450><c> à</c><00:00:59.540><c> les</c><00:00:59.750><c> réhydrater</c><00:00:59.960><c> dans</c><00:01:00.530><c> de</c><00:01:00.680><c> l'eau</c> | |
00:01:00.790 --> 00:01:00.800 align:start position:0% | |
penserez à les réhydrater dans de l'eau | |
00:01:00.800 --> 00:01:05.400 align:start position:0% | |
penserez à les réhydrater dans de l'eau | |
chaude<00:01:00.830><c> une</c><00:01:01.400><c> voire</c><00:01:01.730><c> deux</c><00:01:01.940><c> heures</c><00:01:02.090><c> à</c><00:01:02.240><c> l'avance</c> | |
00:01:05.400 --> 00:01:05.410 align:start position:0% | |
00:01:05.410 --> 00:01:07.990 align:start position:0% | |
si<00:01:06.410><c> vous</c><00:01:06.530><c> n'en</c><00:01:06.650><c> trouvez</c><00:01:06.680><c> pas</c><00:01:07.070><c> à</c><00:01:07.130><c> proximité</c><00:01:07.580><c> de</c> | |
00:01:07.990 --> 00:01:08.000 align:start position:0% | |
si vous n'en trouvez pas à proximité de | |
00:01:08.000 --> 00:01:09.460 align:start position:0% | |
si vous n'en trouvez pas à proximité de | |
chez<00:01:08.090><c> vous</c><00:01:08.270><c> prenez</c><00:01:08.660><c> simplement</c><00:01:08.870><c> une</c><00:01:09.350><c> autre</c> | |
00:01:09.460 --> 00:01:09.470 align:start position:0% | |
chez vous prenez simplement une autre | |
00:01:09.470 --> 00:01:10.930 align:start position:0% | |
chez vous prenez simplement une autre | |
variété<00:01:09.800><c> de</c><00:01:09.950><c> champignons</c><00:01:10.430><c> comme</c><00:01:10.760><c> des</c> | |
00:01:10.930 --> 00:01:10.940 align:start position:0% | |
variété de champignons comme des | |
00:01:10.940 --> 00:01:13.380 align:start position:0% | |
variété de champignons comme des | |
champions<00:01:11.300><c> de</c><00:01:11.420><c> parler</c><00:01:11.630><c> ça</c><00:01:11.810><c> sera</c><00:01:12.080><c> très</c><00:01:12.410><c> bien</c> | |
00:01:13.380 --> 00:01:13.390 align:start position:0% | |
champions de parler ça sera très bien` | |
// Print a heading line followed by the de-duplicated transcript text.
fmt.Println("Cleaned transcript : \n" + GetCleanTranscriptContent(input)) | |
} | |
// isStringFoundInLastEntriesOfSlice reports whether s duplicates one of the
// last max entries of items.
//
// A non-empty entry is a duplicate of s when it occurs anywhere inside s
// (substring match, so an entry equal to s also matches). An empty entry is a
// duplicate only of an empty s: strings.Contains(s, "") is true for every s,
// so without this special case a single empty entry in the window would make
// every candidate look like a duplicate.
//
// Only the trailing max entries are inspected; when max is zero or negative
// no entry is inspected and the result is false. A nil or empty items slice
// also yields false.
func isStringFoundInLastEntriesOfSlice(items []string, s string, max int) bool {
	// Index of the oldest entry that is still inside the look-back window.
	stopIndex := len(items) - max
	if stopIndex < 0 {
		stopIndex = 0
	}

	for i := len(items) - 1; i >= stopIndex; i-- {
		entry := items[i]
		if entry == "" {
			// An empty entry must not match arbitrary text.
			if s == "" {
				return true
			}
			continue
		}
		if strings.Contains(s, entry) {
			return true
		}
	}
	return false
}
func GetCleanTranscriptContent(input string) string { | |
subtitles, err := astisub.ReadFromWebVTT(bytes.NewReader([]byte(input))) | |
if err != nil { | |
fmt.Println("Can't parse transcript", err) | |
} | |
cleanedTranscript := "" | |
previousLines := []string{} | |
for _, item := range subtitles.Items { | |
for _, line := range item.Lines { | |
currentItem := "" | |
sep := "" | |
for _, lineItem := range line.Items { | |
s := strings.TrimSpace(strings.ReplaceAll(lineItem.Text, "\n", "")) | |
currentItem = currentItem + sep + s | |
sep = " " | |
} | |
if !isStringFoundInLastEntriesOfSlice(previousLines, currentItem, 10) { | |
previousLines = append(previousLines, currentItem) | |
} | |
} | |
} | |
cleanedTranscript = strings.Join(previousLines, "\n") | |
return cleanedTranscript | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Playground : https://go.dev/play/p/qzeEtPOILK9