pustozerov commited on
Commit
02dca0a
0 Parent(s):

Temporarily remove samples.

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.wav filter=lfs diff=lfs merge=lfs -text
2
+ *.ogg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ /data/user_data/
2
+ /info/transcripts/
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/PoCCallTranscription.iml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$">
5
+ <excludeFolder url="file://$MODULE_DIR$/data/user_data" />
6
+ <excludeFolder url="file://$MODULE_DIR$/info/transcripts" />
7
+ </content>
8
+ <orderEntry type="inheritedJdk" />
9
+ <orderEntry type="sourceFolder" forTests="false" />
10
+ </component>
11
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
5
+ <option name="ignoredPackages">
6
+ <value>
7
+ <list size="23">
8
+ <item index="0" class="java.lang.String" itemvalue="parselmouth" />
9
+ <item index="1" class="java.lang.String" itemvalue="torchvision" />
10
+ <item index="2" class="java.lang.String" itemvalue="opencv-python-headless" />
11
+ <item index="3" class="java.lang.String" itemvalue="pyannote-audio" />
12
+ <item index="4" class="java.lang.String" itemvalue="pyannote" />
13
+ <item index="5" class="java.lang.String" itemvalue="scipy" />
14
+ <item index="6" class="java.lang.String" itemvalue="nemo_toolkit" />
15
+ <item index="7" class="java.lang.String" itemvalue="scikit-learn" />
16
+ <item index="8" class="java.lang.String" itemvalue="scikit_learn" />
17
+ <item index="9" class="java.lang.String" itemvalue="aiogram" />
18
+ <item index="10" class="java.lang.String" itemvalue="wget" />
19
+ <item index="11" class="java.lang.String" itemvalue="sklearn" />
20
+ <item index="12" class="java.lang.String" itemvalue="nemo" />
21
+ <item index="13" class="java.lang.String" itemvalue="pydub" />
22
+ <item index="14" class="java.lang.String" itemvalue="numpy" />
23
+ <item index="15" class="java.lang.String" itemvalue="omegaconf" />
24
+ <item index="16" class="java.lang.String" itemvalue="pandas" />
25
+ <item index="17" class="java.lang.String" itemvalue="importlib" />
26
+ <item index="18" class="java.lang.String" itemvalue="spacy" />
27
+ <item index="19" class="java.lang.String" itemvalue="matplotlib" />
28
+ <item index="20" class="java.lang.String" itemvalue="librosa" />
29
+ <item index="21" class="java.lang.String" itemvalue="xgboost" />
30
+ <item index="22" class="java.lang.String" itemvalue="torchaudio" />
31
+ </list>
32
+ </value>
33
+ </option>
34
+ </inspection_tool>
35
+ <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
36
+ <option name="ignoredErrors">
37
+ <list>
38
+ <option value="E722" />
39
+ <option value="E402" />
40
+ </list>
41
+ </option>
42
+ </inspection_tool>
43
+ <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
44
+ <option name="ignoredErrors">
45
+ <list>
46
+ <option value="N806" />
47
+ <option value="N803" />
48
+ </list>
49
+ </option>
50
+ </inspection_tool>
51
+ <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
52
+ <option name="ignoredIdentifiers">
53
+ <list>
54
+ <option value="parselmouth" />
55
+ <option value="tuple.mean" />
56
+ <option value="tuple.variance" />
57
+ <option value="tuple.minmax" />
58
+ <option value="tuple.kurtosis" />
59
+ <option value="tuple.skewness" />
60
+ <option value="list.__getitem__" />
61
+ <option value="numpy.core._multiarray_umath.ndarray.A" />
62
+ <option value="cv2.remap" />
63
+ <option value="cv2.INTER_LINEAR" />
64
+ <option value="cv2.initUndistortRectifyMap" />
65
+ <option value="cv2.getOptimalNewCameraMatrix" />
66
+ <option value="cv2.resize" />
67
+ <option value="modules.data_generator.svgs2ttf.fontforge" />
68
+ <option value="modules.data_generator.ttf2pngs.fontforge" />
69
+ <option value="os.sys" />
70
+ <option value="typing.TextIO.__getitem__" />
71
+ </list>
72
+ </option>
73
+ </inspection_tool>
74
+ </profile>
75
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
4
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/PoCCallTranscription.iml" filepath="$PROJECT_DIR$/.idea/PoCCallTranscription.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+ WORKDIR /app
3
+ COPY . .
4
+
5
+ RUN apt-get -y update
6
+ RUN apt-get -y upgrade
7
+
8
+ # Install every package one after another to track time
9
+ RUN python -m pip install --upgrade pip
10
+ RUN pip install -r requirements.txt
11
+
12
+ CMD ["python", "./Interface.py"]
13
+ # Next commands are: docker build -t pustozerov/poc-call-transcription:1.0
Interface.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import random
3
+ import os
4
+ import soundfile as sf
5
+ import streamlit as st
6
+ from pydub import AudioSegment
7
+
8
+ from modules.diarization.nemo_diarization import diarization
9
+
10
+ st.title('Call Transcription demo')
11
+ st.subheader('This simple demo shows the possibilities of the ASR and NLP in the task of '
12
+ 'automatic speech recognition and diarization. It works with mp3, ogg and wav files. You can randomly '
13
+ 'pickup a set of images from the built-in database or try uploading your own files.')
14
+
15
+
16
+ if st.button('Try random samples from the database'):
17
+ folder = "data/datasets/crema_d_diarization_chunks"
18
+ os.makedirs(folder, exist_ok=True)
19
+ list_all_audio = glob.glob("data/datasets/crema_d_diarization_chunks/*.wav")
20
+ chosen_files = sorted(random.sample(list_all_audio, 1))
21
+ file_name = os.path.basename(chosen_files[0]).split(".")[0]
22
+ audio_file = open(chosen_files[0], 'rb')
23
+ audio_bytes = audio_file.read()
24
+ st.audio(audio_bytes)
25
+ f = sf.SoundFile(chosen_files[0])
26
+ st.write("Starting transcription. Estimated processing time: %0.1f seconds" % (f.frames / (f.samplerate * 5)))
27
+ result = diarization(chosen_files[0])
28
+ with open("info/transcripts/pred_rttms/" + file_name + ".txt") as f:
29
+ transcript = f.read()
30
+ st.write("Transcription completed.")
31
+ st.write("Number of speakers: %s" % result[file_name]["speaker_count"])
32
+ st.write("Sentences: %s" % len(result[file_name]["sentences"]))
33
+ st.write("Words: %s" % len(result[file_name]["words"]))
34
+ st.download_button(
35
+ label="Download audio transcript",
36
+ data=transcript,
37
+ file_name='transcript.txt',
38
+ mime='text/csv',
39
+ )
40
+
41
+ uploaded_file = st.file_uploader("Choose your recording with a speech",
42
+ accept_multiple_files=False, type=["mp3", "wav", "ogg"])
43
+ if uploaded_file is not None:
44
+ folder = "data/user_data/"
45
+ os.makedirs(folder, exist_ok=True)
46
+ for f in glob.glob(folder + '*'):
47
+ os.remove(f)
48
+ save_path = folder + uploaded_file.name
49
+ if ".mp3" in uploaded_file:
50
+ sound = AudioSegment.from_mp3(uploaded_file)
51
+ elif ".ogg" in uploaded_file:
52
+ sound = AudioSegment.from_ogg(uploaded_file)
53
+ else:
54
+ sound = AudioSegment.from_wav(uploaded_file)
55
+ sound.export(save_path, format="wav", parameters=["-ac", "1"])
56
+ file_name = os.path.basename(save_path).split(".")[0]
57
+ audio_file = open(save_path, 'rb')
58
+ audio_bytes = audio_file.read()
59
+ st.audio(audio_bytes)
60
+ f = sf.SoundFile(save_path)
61
+ st.write("Starting transcription. Estimated processing time: %0.0f minutes and %02.0f seconds"
62
+ % ((f.frames / (f.samplerate * 3) // 60), (f.frames / (f.samplerate * 3) % 60)))
63
+ result = diarization(save_path)
64
+ with open("info/transcripts/pred_rttms/" + file_name + ".txt") as f:
65
+ transcript = f.read()
66
+ st.write("Transcription completed.")
67
+ st.write("Number of speakers: %s" % result[file_name]["speaker_count"])
68
+ st.write("Sentences: %s" % len(result[file_name]["sentences"]))
69
+ st.write("Words: %s" % len(result[file_name]["words"]))
70
+ st.download_button(
71
+ label="Download audio transcript",
72
+ data=transcript,
73
+ file_name='transcript.txt',
74
+ mime='text/csv',
75
+ )
info/configs/manifests/external_vad_manifest.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 0.9703125, "duration": 1.1475000000000004, "label": "SPEECH", "uniq_id": "26"}
2
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 3.2146875, "duration": 1.7381250000000001, "label": "SPEECH", "uniq_id": "26"}
3
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 5.340937500000001, "duration": 1.8899999999999988, "label": "SPEECH", "uniq_id": "26"}
4
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 8.4965625, "duration": 1.0293750000000017, "label": "SPEECH", "uniq_id": "26"}
5
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 10.454062500000003, "duration": 11.930625, "label": "SPEECH", "uniq_id": "26"}
6
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 23.0934375, "duration": 1.6537500000000023, "label": "SPEECH", "uniq_id": "26"}
7
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 26.9746875, "duration": 1.5187500000000007, "label": "SPEECH", "uniq_id": "26"}
8
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 29.7253125, "duration": 3.526875000000004, "label": "SPEECH", "uniq_id": "26"}
9
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 34.9228125, "duration": 1.2150000000000034, "label": "SPEECH", "uniq_id": "26"}
10
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 37.2009375, "duration": 1.4343750000000028, "label": "SPEECH", "uniq_id": "26"}
11
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 40.5084375, "duration": 1.8225000000000051, "label": "SPEECH", "uniq_id": "26"}
12
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 42.972187500000004, "duration": 3.628124999999997, "label": "SPEECH", "uniq_id": "26"}
13
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 47.7478125, "duration": 1.822499999999998, "label": "SPEECH", "uniq_id": "26"}
14
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 50.5153125, "duration": 1.9575000000000031, "label": "SPEECH", "uniq_id": "26"}
15
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 53.1478125, "duration": 1.7212500000000048, "label": "SPEECH", "uniq_id": "26"}
16
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 55.3246875, "duration": 1.4681250000000006, "label": "SPEECH", "uniq_id": "26"}
17
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 58.6153125, "duration": 1.7718750000000014, "label": "SPEECH", "uniq_id": "26"}
18
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 60.9271875, "duration": 1.8900000000000006, "label": "SPEECH", "uniq_id": "26"}
19
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 64.01531250000001, "duration": 1.4681250000000006, "label": "SPEECH", "uniq_id": "26"}
20
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 66.4115625, "duration": 1.991250000000008, "label": "SPEECH", "uniq_id": "26"}
info/configs/manifests/input_manifest.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 0, "duration": null, "label": "infer", "text": "-", "num_speakers": 2, "rttm_filepath": null, "uem_filepath": null}
info/configs/offline_diarization_asr.yaml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: ClusterDiarizer
2
+ num_workers: 0
3
+ sample_rate: 16000
4
+ batch_size: 64
5
+ diarizer:
6
+ manifest_filepath: ???
7
+ out_dir: ???
8
+ oracle_vad: false
9
+ collar: 0.25
10
+ ignore_overlap: true
11
+ vad:
12
+ model_path: null
13
+ external_vad_manifest: null
14
+ parameters:
15
+ window_length_in_sec: 0.15
16
+ shift_length_in_sec: 0.01
17
+ smoothing: median
18
+ overlap: 0.875
19
+ onset: 0.4
20
+ offset: 0.7
21
+ pad_onset: 0.05
22
+ pad_offset: -0.1
23
+ min_duration_on: 0.2
24
+ min_duration_off: 0.2
25
+ filter_speech_first: true
26
+ speaker_embeddings:
27
+ model_path: ???
28
+ parameters:
29
+ window_length_in_sec: 1.5
30
+ shift_length_in_sec: 0.75
31
+ multiscale_weights: null
32
+ save_embeddings: false
33
+ clustering:
34
+ parameters:
35
+ oracle_num_speakers: false
36
+ max_num_speakers: 20
37
+ enhanced_count_thres: 80
38
+ max_rp_threshold: 0.25
39
+ sparse_search_volume: 30
40
+ maj_vote_spk_count: false
41
+ asr:
42
+ model_path: ???
43
+ parameters:
44
+ asr_based_vad: false
45
+ asr_based_vad_threshold: 0.05
46
+ asr_batch_size: null
47
+ lenient_overlap_WDER: true
48
+ decoder_delay_in_sec: null
49
+ word_ts_anchor_offset: null
50
+ word_ts_anchor_pos: start
51
+ fix_word_ts_with_VAD: false
52
+ colored_text: false
53
+ print_time: true
54
+ break_lines: false
55
+ ctc_decoder_parameters:
56
+ pretrained_language_model: null
57
+ beam_width: 32
58
+ alpha: 0.5
59
+ beta: 2.5
60
+ realigning_lm_parameters:
61
+ arpa_language_model: null
62
+ min_number_of_words: 3
63
+ max_number_of_words: 10
64
+ logprob_diff_threshold: 1.2
modules/diarization/nemo_diarization.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASR_TIMESTAMPS
4
+ from nemo.collections.asr.parts.utils.diarization_utils import ASR_DIAR_OFFLINE
5
+ from omegaconf import OmegaConf
6
+ from pyannote.audio import Pipeline
7
+
8
+ ROOT = os.getcwd()
9
+ MODEL_CONFIG = "info/configs/offline_diarization_asr.yaml"
10
+ data_dir = os.path.join(ROOT, 'info/configs/')
11
+ os.makedirs(data_dir, exist_ok=True)
12
+ output_dir = os.path.join(ROOT, 'info/transcripts/')
13
+ os.makedirs(output_dir, exist_ok=True)
14
+
15
+
16
+ def diarization(file_path):
17
+ # Create a manifest for input with below format.
18
+ # {'audio_filepath': /path/to/audio_file, 'offset': 0, 'duration':None, 'label': 'infer', 'text': '-',
19
+ # 'num_speakers': None, 'rttm_filepath': /path/to/rttm/file, 'uem_filepath'='/path/to/uem/filepath'}
20
+ import json
21
+ meta = {
22
+ 'audio_filepath': file_path,
23
+ 'offset': 0,
24
+ 'duration': None,
25
+ 'label': 'infer',
26
+ 'text': '-',
27
+ 'num_speakers': 2,
28
+ 'rttm_filepath': None,
29
+ 'uem_filepath': None
30
+ }
31
+ with open(os.path.join(data_dir, 'manifests/', 'input_manifest.json'), 'w') as fp:
32
+ json.dump(meta, fp)
33
+ fp.write('\n')
34
+
35
+ # Make a manifest with an external VAD
36
+ pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection")
37
+ output = pipeline(file_path)
38
+ initial_json = output.for_json()
39
+ keys = ("audio_filepath", "offset", "duration", "label")
40
+ output_json = []
41
+ for segment in initial_json["content"]:
42
+ vad_json = dict.fromkeys(keys)
43
+ vad_json["audio_filepath"] = file_path
44
+ vad_json["offset"] = segment["segment"]["start"]
45
+ vad_json["duration"] = segment["segment"]["end"] - segment["segment"]["start"]
46
+ vad_json["label"] = "SPEECH"
47
+ vad_json["uniq_id"] = initial_json["uri"]
48
+ output_json.append(vad_json)
49
+ with open(os.path.join(data_dir, 'manifests/', 'external_vad_manifest.json'), 'w') as f:
50
+ for item in output_json:
51
+ f.write(str(item).replace("'", '"') + '\n')
52
+
53
+ config2 = OmegaConf.load(MODEL_CONFIG)
54
+ config2.diarizer.asr.model_path = 'QuartzNet15x5Base-En'
55
+ config2.diarizer.manifest_filepath = \
56
+ os.path.join(data_dir, 'manifests/', 'input_manifest.json')
57
+ config2.diarizer.speaker_embeddings.model_path = 'titanet_large'
58
+ config2.diarizer.vad.external_vad_manifest = \
59
+ os.path.join(data_dir, 'manifests/', 'external_vad_manifest.json')
60
+ config2.diarizer.out_dir = output_dir
61
+ config2.num_workers = 0
62
+ asr_ts_decoder = ASR_TIMESTAMPS(**config2.diarizer)
63
+ asr_model = asr_ts_decoder.set_asr_model()
64
+ word_hyp, word_ts_hyp = asr_ts_decoder.run_ASR(asr_model)
65
+ print(word_hyp)
66
+ print(word_ts_hyp)
67
+
68
+ asr_diar_offline = ASR_DIAR_OFFLINE(**config2.diarizer)
69
+ asr_diar_offline.word_ts_anchor_offset = asr_ts_decoder.word_ts_anchor_offset
70
+ diar_hyp, diar_score = asr_diar_offline.run_diarization(config2, word_ts_hyp)
71
+ print("Diarization hypothesis output: \n", diar_hyp)
72
+ result = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)
73
+ file_to_show = os.path.join(data_dir, 'transcripts/pred_rttms/', file_path.split('/')[-1].split(".")[0], '.txt')
74
+ print(file_to_show)
75
+ print(diar_hyp)
76
+ return result
packages.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ libsndfile1
2
+ ffmpeg
3
+ python3-pip
4
+ python3-dev
requirements.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ astroid
2
+ braceexpand==0.1.7
3
+ editdistance==0.6.0
4
+ einops==0.3.2
5
+ h5py==3.7.0
6
+ hydra-core==1.1.2
7
+ ijson==3.1.4
8
+ inflect==5.6.0
9
+ ipadic==1.0.0
10
+ ipython==8.4.0
11
+ jieba==0.42.1
12
+ kenlm @ https://github.com/kpu/kenlm/archive/master.zip
13
+ librosa==0.9.2
14
+ mecab-python3==1.0.5
15
+ nemo-toolkit @ git+https://github.com/NVIDIA/NeMo.git@6442e339a47d30a106d869d1ef29cc1294753b75
16
+ omegaconf==2.1.2
17
+ OpenCC==1.1.1
18
+ pangu==4.0.6.1
19
+ praat-parselmouth==0.4.1
20
+ protobuf==3.19.4
21
+ psutil==5.9.1
22
+ pyannote.audio @ https://github.com/pyannote/pyannote-audio/archive/develop.zip
23
+ pyannote.core==4.4
24
+ pyannote.database==4.1.3
25
+ pyannote.metrics==3.2
26
+ pyannote.pipeline==2.3
27
+ pyctcdecode==0.3.0
28
+ pydub==0.25.1
29
+ pytorch-lightning==1.6.5
30
+ sacrebleu==2.1.0
31
+ sacremoses==0.0.53
32
+ sentencepiece==0.1.96
33
+ SoundFile==0.10.3.post1
34
+ spacy==3.4.0
35
+ speechbrain @ git+https://github.com/speechbrain/speechbrain.git
36
+ streamlit==1.10.0
37
+ torch==1.12.0
38
+ torchaudio==0.12.0
39
+ transformers==4.20.0
40
+ webdataset==0.1.62
41
+ Cython==0.29.14
42
+ youtokentome