Created
July 25, 2023 07:23
-
-
Save ucalyptus2/0fb5f2548a871e3dd247582a7fb02228 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Evaluate a Bengali ASR checkpoint on FLEURS: stream the "bn_in" train split,
# transcribe up to NUM_SAMPLES utterances, and report the word error rate
# (WER) computed by jiwer, both as a running value and as a final figure.
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from transformers import pipeline as pipesetup
import tqdm
import jiwer

NUM_SAMPLES = 1000  # Adjust according to your requirement
REPORT_EVERY = 10  # Print a running WER after this many samples

# Use a streaming dataset so samples are fetched lazily while iterating
fleurs = load_dataset("google/fleurs", "bn_in", split="train", streaming=True)

pred, gt = [], []
pipeline = pipesetup(model="ai4bharat/indicwav2vec_v1_bengali")

# zip() ends the loop at whichever runs out first: the sample budget or the
# stream itself. A bare next() on the stream would instead raise
# StopIteration and abort the script before the final WER is printed.
for i, data in zip(tqdm.tqdm(range(NUM_SAMPLES)), fleurs):
    # No max_new_tokens here: it is a text-generation option and, assuming
    # this wav2vec2 checkpoint is a CTC model (TODO confirm), it has no
    # effect on decoding.
    # NOTE(review): the raw array is passed without its sampling rate, so the
    # pipeline presumably treats it as already being at the model's expected
    # rate - confirm this holds for FLEURS audio.
    out = pipeline(data["audio"]["array"])
    gt.append(data["raw_transcription"])
    pred.append(out["text"])
    if (i + 1) % REPORT_EVERY == 0:
        print(f'WER after {i+1} samples: {jiwer.wer(gt,pred)}')

# WER computation after all the samples have been processed
if gt:
    final_wer = jiwer.wer(gt, pred)
    print(f'Final WER: {final_wer}')
else:
    # WER is undefined when there is no reference text to compare against.
    final_wer = None
    print('No samples were processed; WER is undefined.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment