Whisper-faster (speech-to-text) command-line usage

https://github.com/Purfview/whisper-standalone-win

usage: whisper-faster.exe [-h] [--model MODEL] [--model_dir MODEL_DIR] [--device DEVICE] [--output_dir OUTPUT_DIR]
                          [--output_format {lrc,txt,text,vtt,srt,tsv,json,all}] [--verbose VERBOSE]
                          [--task {transcribe,translate}]
                          [--language {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,yue,zh,Afrikaans,Albanian,Amharic,Arabic,Armenian,Assamese,Azerbaijani,Bashkir,Basque,Belarusian,Bengali,Bosnian,Breton,Bulgarian,Burmese,Cantonese,Castilian,Catalan,Chinese,Croatian,Czech,Danish,Dutch,English,Estonian,Faroese,Finnish,Flemish,French,Galician,Georgian,German,Greek,Gujarati,Haitian,Haitian Creole,Hausa,Hawaiian,Hebrew,Hindi,Hungarian,Icelandic,Indonesian,Italian,Japanese,Javanese,Kannada,Kazakh,Khmer,Korean,Lao,Latin,Latvian,Letzeburgesch,Lingala,Lithuanian,Luxembourgish,Macedonian,Malagasy,Malay,Malayalam,Maltese,Mandarin,Maori,Marathi,Moldavian,Moldovan,Mongolian,Myanmar,Nepali,Norwegian,Nynorsk,Occitan,Panjabi,Pashto,Persian,Polish,Portuguese,Punjabi,Pushto,Romanian,Russian,Sanskrit,Serbian,Shona,Sindhi,Sinhala,Sinhalese,Slovak,Slovenian,Somali,Spanish,Sundanese,Swahili,Swedish,Tagalog,Tajik,Tamil,Tatar,Telugu,Thai,Tibetan,Turkish,Turkmen,Ukrainian,Urdu,Uzbek,Valencian,Vietnamese,Welsh,Yiddish,Yoruba}]
                          [--language_detection_threshold LANGUAGE_DETECTION_THRESHOLD]
                          [--language_detection_segments LANGUAGE_DETECTION_SEGMENTS] [--temperature TEMPERATURE]
                          [--best_of BEST_OF] [--beam_size BEAM_SIZE] [--patience PATIENCE]
                          [--length_penalty LENGTH_PENALTY] [--repetition_penalty REPETITION_PENALTY]
                          [--no_repeat_ngram_size NO_REPEAT_NGRAM_SIZE] [--suppress_blank SUPPRESS_BLANK]
                          [--suppress_tokens SUPPRESS_TOKENS] [--initial_prompt INITIAL_PROMPT] [--prefix PREFIX]
                          [--condition_on_previous_text CONDITION_ON_PREVIOUS_TEXT]
                          [--prompt_reset_on_temperature PROMPT_RESET_ON_TEMPERATURE]
                          [--without_timestamps WITHOUT_TIMESTAMPS] [--max_initial_timestamp MAX_INITIAL_TIMESTAMP]
                          [--temperature_increment_on_fallback TEMPERATURE_INCREMENT_ON_FALLBACK]
                          [--compression_ratio_threshold COMPRESSION_RATIO_THRESHOLD]
                          [--logprob_threshold LOGPROB_THRESHOLD] [--no_speech_threshold NO_SPEECH_THRESHOLD]
                          [--v3_offsets_off] [--hallucination_silence_threshold HALLUCINATION_SILENCE_THRESHOLD]
                          [--hallucination_silence_th_temp {0.0,0.2,0.5,0.8,1.0}] [--clip_timestamps CLIP_TIMESTAMPS]
                          [--no_speech_strict_lvl {0,1,2}] [--word_timestamps WORD_TIMESTAMPS]
                          [--highlight_words HIGHLIGHT_WORDS] [--prepend_punctuations PREPEND_PUNCTUATIONS]
                          [--append_punctuations APPEND_PUNCTUATIONS] [--threads THREADS] [--version]
                          [--vad_filter VAD_FILTER] [--vad_threshold VAD_THRESHOLD]
                          [--vad_min_speech_duration_ms VAD_MIN_SPEECH_DURATION_MS]
                          [--vad_max_speech_duration_s VAD_MAX_SPEECH_DURATION_S]
                          [--vad_min_silence_duration_ms VAD_MIN_SILENCE_DURATION_MS]
                          [--vad_speech_pad_ms VAD_SPEECH_PAD_MS] [--vad_window_size_samples VAD_WINDOW_SIZE_SAMPLES]
                          [--vad_dump] [--max_new_tokens MAX_NEW_TOKENS] [--chunk_length CHUNK_LENGTH]
                          [--compute_type {default,auto,int8,int8_float16,int8_float32,int8_bfloat16,int16,float16,float32,bfloat16}]
                          [--batch_recursive] [--beep_off] [--skip] [--checkcuda] [--print_progress] [--postfix]
                          [--check_files] [--PR163_off] [--hallucinations_list_off] [--one_word {0,1,2}] [--sentence]
                          [--standard] [--standard_asia] [--max_comma MAX_COMMA]
                          [--max_comma_cent {50,60,70,80,90,100}] [--max_gap MAX_GAP]
                          [--max_line_width MAX_LINE_WIDTH] [--max_line_count MAX_LINE_COUNT]
                          [--min_dist_to_end {0,4,5,6,7,8,9,10,11,12}] [--prompt_max {16,32,64,128,223}]
                          [--reprompt {0,1,2}] [--prompt_reset_on_no_end {0,1,2}] [--ff_dump]
                          [--ff_track {1,2,3,4,5,6}] [--ff_fc] [--ff_mp3] [--ff_sync] [--ff_rnndn_sh]
                          [--ff_rnndn_xiph] [--ff_fftdn [0 - 97]] [--ff_tempo [0.5 - 2.0]] [--ff_gate]
                          [--ff_speechnorm] [--ff_loudnorm] [--ff_silence_suppress noise duration] [--ff_lowhighpass]
                          audio [audio ...]

Leave a comment