rmayormartins committed on
Commit
745fe01
•
1 Parent(s): 956a08e

Subindo arquivos

Browse files
README.md CHANGED
@@ -1,13 +1,65 @@
1
  ---
2
- title: Speech Accent Pt Br Classifier
3
- emoji: 📚
4
  colorFrom: blue
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 4.36.1
8
  app_file: app.py
9
  pinned: false
10
- license: ecl-2.0
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Speech-accent-pt-br-classifier
3
+ emoji: 🎙️🤖🇧🇷
4
  colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: "4.12.0"
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
+ # Speech Portuguese (Brazilian) Accent Classifier
13
+
14
+ This project is a speech accent classifier that distinguishes between Portuguese (Brazilian) and other accents.
15
+
16
+ ## Project Overview
17
+
18
+ This application uses a trained model to classify speech accents into two categories:
19
+ 1. Portuguese (Brazilian)
20
+ 2. Other
21
+
22
+ The model is based on the author's work [(results) brazil pt accent] and utilizes the Portuguese portion of the Common Voice dataset (version 11.0) from Mozilla Foundation.
23
+
24
+ ## Dataset
25
+
26
+ The project uses the Portuguese subset of the Common Voice dataset:
27
+ - Dataset: "mozilla-foundation/common_voice_11_0", "pt"
28
+
29
+ Brazilian accents included in the dataset:
30
+ - Português do Brasil, Região Sul do Brasil
31
+ - Paulistano
32
+ - Paulista, Brasileiro
33
+ - Carioca
34
+ - Mato Grosso
35
+ - Mineiro
36
+ - Interior Paulista
37
+ - Gaúcho
38
+ - Nordestino
39
+ - And various regional mixes
40
+
41
+ ## Technical Details
42
+
43
+ The project utilizes the following model and processor:
44
+ - Model: "facebook/wav2vec2-base-960h"
45
+ - Processor: Wav2Vec2Processor.from_pretrained
46
+
47
+ ## License
48
+
49
+ ecl
50
+
51
+ ## Developer Information
52
+
53
+ Developed by Ramon Mayor Martins (2024)
54
+ - Email: rmayormartins@gmail.com
55
+ - Homepage: https://rmayormartins.github.io/
56
+ - Twitter: @rmayormartins
57
+ - GitHub: https://github.com/rmayormartins
58
+
59
+ ## Acknowledgements
60
+
61
+ Special thanks to Instituto Federal de Santa Catarina (Federal Institute of Santa Catarina) IFSC-São José-Brazil.
62
+
63
+ ## Contact
64
+
65
+ For any queries or suggestions, please contact the developer using the information provided above.
app.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
5
+
6
+ # Saved model and processor, both loaded from the "results" directory
7
+ model_name = "results"
8
+ processor = Wav2Vec2Processor.from_pretrained(model_name)
9
+ model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
10
+
11
+ def classify_accent(audio):
12
+ if audio is None:
13
+ return "Erro: Nenhum Γ‘udio recebido"
14
+
15
+ # entrada
16
+ print(f"Tipo de entrada de Γ‘udio: {type(audio)}")
17
+
18
+ # O Γ‘udio formato
19
+ print(f"Received audio input: {audio}")
20
+
21
+ try:
22
+ audio_array = audio[1] # O Γ‘udio da tupla
23
+ sample_rate = audio[0] # A taxa de amostragem da tupla
24
+
25
+ print(f"Shape do Γ‘udio: {audio_array.shape}, Taxa de amostragem: {sample_rate}")
26
+
27
+ #
28
+ audio_array = audio_array.astype(np.float32)
29
+
30
+ # taxa de amostragem
31
+ if sample_rate != 16000:
32
+ import librosa
33
+ audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
34
+
35
+ input_values = processor(audio_array, return_tensors="pt", sampling_rate=16000).input_values
36
+ # Inf
37
+ with torch.no_grad():
38
+ logits = model(input_values).logits
39
+ predicted_ids = torch.argmax(logits, dim=-1).item()
40
+
41
+ # ids accent
42
+ labels = ["Brazilian", "Outro"]
43
+ return labels[predicted_ids]
44
+
45
+ except Exception as e:
46
+ return f"Erro ao processar o Γ‘udio: {str(e)}"
47
+
48
# Gradio interface: audio in (as a numpy tuple), predicted accent label out.
interface = gr.Interface(
    fn=classify_accent,
    inputs=gr.Audio(type="numpy"),
    outputs="label",
)
interface.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio==4.29.0
2
+ torch
3
+ transformers
4
+ librosa
5
+ numpy
results/config.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-base-960h",
3
+ "activation_dropout": 0.1,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForSequenceClassification"
11
+ ],
12
+ "attention_dropout": 0.1,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 256,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": false,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "sum",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": false,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "group",
53
+ "feat_proj_dropout": 0.1,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.1,
56
+ "gradient_checkpointing": false,
57
+ "hidden_act": "gelu",
58
+ "hidden_dropout": 0.1,
59
+ "hidden_dropout_prob": 0.1,
60
+ "hidden_size": 768,
61
+ "initializer_range": 0.02,
62
+ "intermediate_size": 3072,
63
+ "layer_norm_eps": 1e-05,
64
+ "layerdrop": 0.1,
65
+ "mask_feature_length": 10,
66
+ "mask_feature_min_masks": 0,
67
+ "mask_feature_prob": 0.0,
68
+ "mask_time_length": 10,
69
+ "mask_time_min_masks": 2,
70
+ "mask_time_prob": 0.05,
71
+ "model_type": "wav2vec2",
72
+ "num_adapter_layers": 3,
73
+ "num_attention_heads": 12,
74
+ "num_codevector_groups": 2,
75
+ "num_codevectors_per_group": 320,
76
+ "num_conv_pos_embedding_groups": 16,
77
+ "num_conv_pos_embeddings": 128,
78
+ "num_feat_extract_layers": 7,
79
+ "num_hidden_layers": 12,
80
+ "num_negatives": 100,
81
+ "output_hidden_size": 768,
82
+ "pad_token_id": 0,
83
+ "proj_codevector_dim": 256,
84
+ "tdnn_dilation": [
85
+ 1,
86
+ 2,
87
+ 3,
88
+ 1,
89
+ 1
90
+ ],
91
+ "tdnn_dim": [
92
+ 512,
93
+ 512,
94
+ 512,
95
+ 512,
96
+ 1500
97
+ ],
98
+ "tdnn_kernel": [
99
+ 5,
100
+ 3,
101
+ 3,
102
+ 1,
103
+ 1
104
+ ],
105
+ "torch_dtype": "float32",
106
+ "transformers_version": "4.41.2",
107
+ "use_weighted_layer_sum": false,
108
+ "vocab_size": 32,
109
+ "xvector_output_dim": 512
110
+ }
results/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2832c855da20b63ee6353ba032d27bd9c5c4920c64bc1fe49fe5d498f6f87d0
3
+ size 378302360
results/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": false,
9
+ "sampling_rate": 16000
10
+ }
results/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
results/tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": true,
22
+ "normalized": false,
23
+ "rstrip": true,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": true,
30
+ "normalized": false,
31
+ "rstrip": true,
32
+ "single_word": false,
33
+ "special": false
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": true,
38
+ "do_lower_case": false,
39
+ "do_normalize": true,
40
+ "eos_token": "</s>",
41
+ "model_max_length": 1000000000000000019884624838656,
42
+ "pad_token": "<pad>",
43
+ "processor_class": "Wav2Vec2Processor",
44
+ "replace_word_delimiter_char": " ",
45
+ "return_attention_mask": false,
46
+ "target_lang": null,
47
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
48
+ "unk_token": "<unk>",
49
+ "word_delimiter_token": "|"
50
+ }
results/vocab.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "'": 27,
3
+ "</s>": 2,
4
+ "<pad>": 0,
5
+ "<s>": 1,
6
+ "<unk>": 3,
7
+ "A": 7,
8
+ "B": 24,
9
+ "C": 19,
10
+ "D": 14,
11
+ "E": 5,
12
+ "F": 20,
13
+ "G": 21,
14
+ "H": 11,
15
+ "I": 10,
16
+ "J": 29,
17
+ "K": 26,
18
+ "L": 15,
19
+ "M": 17,
20
+ "N": 9,
21
+ "O": 8,
22
+ "P": 23,
23
+ "Q": 30,
24
+ "R": 13,
25
+ "S": 12,
26
+ "T": 6,
27
+ "U": 16,
28
+ "V": 25,
29
+ "W": 18,
30
+ "X": 28,
31
+ "Y": 22,
32
+ "Z": 31,
33
+ "|": 4
34
+ }