diff --git a/Jenkinsfile b/Jenkinsfile index 373017e25..e9cfcde12 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-29-24-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-13-24-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { @@ -94,22 +94,27 @@ pipeline { } } stage('L0: Create HI TN/ITN Grammars') { - when { + when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: Hi TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hi --text="१" --cache_dir ${HI_TN_CACHE}' + } } - } - failFast true - parallel { stage('L0: Hi ITN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language hi --text="बीस" --cache_dir ${HI_TN_CACHE}' - } + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hi --text="एक" --cache_dir ${HI_TN_CACHE}' + } } - + } - } + } stage('L0: Create DE/ES TN/ITN Grammars') { when { diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/measure/__init__.py b/nemo_text_processing/inverse_text_normalization/hi/data/measure/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/measure/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/money/__init__.py b/nemo_text_processing/inverse_text_normalization/hi/data/money/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/money/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hi/__init__.py b/nemo_text_processing/text_normalization/hi/__init__.py new file mode 100644 index 000000000..6cd1f01f4 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.text_normalization.hi.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.text_normalization.hi.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.text_normalization.hi.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/text_normalization/hi/data/__init__.py b/nemo_text_processing/text_normalization/hi/data/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hi/data/date/__init__.py b/nemo_text_processing/text_normalization/hi/data/date/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hi/data/date/days.tsv b/nemo_text_processing/text_normalization/hi/data/date/days.tsv new file mode 100644 index 000000000..b210f5207 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/days.tsv @@ -0,0 +1,31 @@ +०१ एक +०२ दो +०३ तीन +०४ चार +०५ पाँच +०६ छः +०७ सात +०८ आठ +०९ नौ +१० दस +११ ग्यारह +१२ बारह +१३ तेरह +१४ चौदह +१५ पंद्रह +१६ सोलह +१७ सत्रह +१८ अठारह +१९ उन्नीस +२० बीस +२१ इक्कीस +२२ बाईस +२३ तेईस +२४ चौबीस +२५ पच्चीस +२६ छब्बीस +२७ सत्ताईस +२८ अट्ठाईस +२९ उनतीस +३० तीस +३१ इकतीस diff --git a/nemo_text_processing/text_normalization/hi/data/date/months.tsv b/nemo_text_processing/text_normalization/hi/data/date/months.tsv new file mode 100644 index 000000000..8b27041ac --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/months.tsv @@ -0,0 +1,12 @@ +०१ जनवरी +०२ फ़रवरी +०३ मार्च +०४ अप्रैल +०५ मई +०६ जून +०७ जुलाई +०८ अगस्त +०९ सितंबर +१० अक्टूबर +११ नवंबर +१२ दिसंबर diff --git a/nemo_text_processing/text_normalization/hi/data/measure/__init__.py b/nemo_text_processing/text_normalization/hi/data/measure/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/measure/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv new file mode 100644 index 000000000..ba453d332 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv @@ -0,0 +1,154 @@ +°C डिग्री सेल्सियस +°F डिग्री फारेनहाइट +K केल्विन +g ग्राम +kg किलोग्राम +mg मिलीग्राम +cg सेंटीग्राम +dg डेसीग्राम +Tg टेराग्राम +Mg मेगाग्राम +Gg गीगाग्राम +hg हेक्टोग्राम +dag डेकाग्राम +lb पाउंड +oz आउन्स +t टन +st स्टोन +q क्विंटल +m मीटर +cm सेंटीमीटर +mm मिलीमीटर +km किलोमीटर +dm डेसीमीटर +dam डेकामीटर +nm नैनोमीटर +hm हेक्टोमीटर +Mm मेगामीटर +my मिरियामीटर +mi मील +ft फीट +in इंच +yd यार्ड +µm माइक्रोमीटर +m² वर्ग मीटर +cm² वर्ग सेंटीमीटर +mm² वर्ग मिलीमीटर +km² वर्ग किलोमीटर +hm² वर्ग हेक्टोमीटर +dm² वर्ग डेसीमीटर +dam² वर्ग डेकामीटर +yd² वर्ग यार्ड +ft² वर्ग फीट +ac² वर्ग एकड़ +my² वर्ग मिरियामीटर +mi² वर्ग मील +nm² वर्ग नैनोमीटर +µm² वर्ग माइक्रोमीटर +in² वर्ग इंच +my³ घन मिरियामीटर +mi³ घन मील +ac³ घन एकड़ +ha हेक्टेयर +ac एकड़ +गज गज +गज² वर्ग गज +गज³ घन गज +m³ घन मीटर +cm³ घन सेंटीमीटर +mm³ घन मिलीमीटर +dam³ घन डेकामीटर +µm³ घन माइक्रोमीटर +ml³ घन मिलीलीटर +l लीटर +kl किलोलीटर +ml मिलीलीटर +ml² वर्ग मिलीलीटर +dl डेसीलीटर +hl हेक्टोलीटर +cl सेंटीलीटर +dal डेकालीटर +dl² वर्ग डेसीलीटर +dal² वर्ग डेकालीटर +dl³ घन डेसीलीटर +dal³ घन डेकालीटर +L लीटर +kL किलोलीटर +mL मिलीलीटर +mL² वर्ग मिलीलीटर +mL³ घन मिलीलीटर +dL 
डेसीलीटर +hL हेक्टोलीटर +cL सेंटीलीटर +daL डेकालीटर +dL² वर्ग डेसीलीटर +daL² वर्ग डेकालीटर +dL³ घन डेसीलीटर +daL³ घन डेकालीटर +GB गीगाबाइट +in³ घन इंच +ft³ घन फीट +yd³ घन यार्ड +my³ घन मिरियामीटर +mi³ घन मील +dm³ घन डेसीमीटर +dm³ घन डेसीमीटर +km³ घन किलोमीटर +nm³ घन नैनोमीटर +mm³ घन मिलीमीटर +qt क्वार्ट +gal गैलन +pt पिंट +W वाट +MW मेगावाट +KW किलोवाट +b बिट +Mb मेगाबिट +B बाइट +GB गीगाबाइट +KB किलोबाइट +TB टेराबाइट +MB मेगाबाइट +PB पेटाबाइट +EB एक्साबाइट +ZB जेटाबाइट +YB योटाबाइट +BB ब्रोन्टोबाइट +C कूलंब +V वोल्ट +Pa पास्कल +A ऐंपीयर +J जूल +s सेकंड +hr घंटा +h घंटे +min मिनट +ha हेक्टेयर +ha² वर्ग हेक्टेयर +Ω ओम +MΩ मेगाओम +doz दर्जन +Hz हर्ट्ज़ +GHz गीगाहर्ट्ज़ +KHz किलोहर्ट्ज़ +N न्यूटन +dB डेसीबल +yr साल +yr वर्ष +hp हॉर्सपॉवर +d दिन +month महीना +months महीने +ct कैरेट +pH पीएच +km/h किलोमीटर प्रति घंटा +km/min किलोमीटर प्रति मिनट +m/h मीटर प्रति घंटा +mi/s मील प्रति सेकंड +mi/h मील प्रति घंटा +mi/min मील प्रति मिनट +₹/ac रुपए प्रति एकड़ +x बाई +X बाई +* बाई +- से diff --git a/nemo_text_processing/text_normalization/hi/data/money/__init__.py b/nemo_text_processing/text_normalization/hi/data/money/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/money/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/hi/data/money/currency.tsv b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv new file mode 100644 index 000000000..88633ec7c --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv @@ -0,0 +1,10 @@ +₹ रुपए +P पैसे +£ पाउंड +₩ वॉन +$ डॉलर +₺ लीरा +৳ टका +¥ येन +₦ नाइरा +€ यूरो diff --git a/nemo_text_processing/text_normalization/hi/data/numbers/__init__.py b/nemo_text_processing/text_normalization/hi/data/numbers/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/hi/data/numbers/digit.tsv b/nemo_text_processing/text_normalization/hi/data/numbers/digit.tsv new file mode 100644 index 000000000..2ab9af461 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/numbers/digit.tsv @@ -0,0 +1,9 @@ +१ एक +२ दो +३ तीन +४ चार +५ पाँच +६ छह +७ सात +८ आठ +९ नौ diff --git a/nemo_text_processing/text_normalization/hi/data/numbers/hundred.tsv b/nemo_text_processing/text_normalization/hi/data/numbers/hundred.tsv new file mode 100644 index 000000000..049f029bc --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/numbers/hundred.tsv @@ -0,0 +1 @@ +१०० एक सौ diff --git a/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv b/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv new file mode 100644 index 000000000..1d61c77b7 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv @@ -0,0 +1,90 @@ +१० दस +११ ग्यारह +१२ बारह +१३ तेरह +१४ चौदह +१५ पंद्रह +१६ सोलह +१७ सत्रह +१८ अठारह +१९ उन्नीस +२० बीस +२१ इक्कीस +२२ बाईस +२३ तेईस +२४ चौबीस +२५ पच्चीस +२६ छब्बीस +२७ सत्ताईस +२८ अट्ठाईस +२९ उनतीस +३० तीस +३१ इकतीस +३२ बत्तीस +३३ तैंतीस +३४ चौंतीस +३५ पैंतीस +३६ छत्तीस +३७ सैंतीस +३८ अड़तीस +३९ उनतालीस +४० चालीस +४१ इकतालीस +४२ बयालीस +४३ तैंतालीस +४४ चौवालीस +४५ पैंतालीस +४६ छियालीस +४७ सैंतालीस +४८ अड़तालीस +४९ उनचास +५० पचास +५१ इक्यावन +५२ बावन +५३ तिरेपन +५४ चौवन +५५ पचपन +५६ छप्पन +५७ सत्तावन +५८ अट्ठावन +५९ उनसठ +६० साठ +६१ इकसठ +६२ बासठ +६३ तिरेसठ +६४ चौंसठ +६५ पैंसठ +६६ छियासठ +६७ सड़सठ +६८ अड़सठ +६९ उनहत्तर +७० सत्तर +७१ इकहत्तर +७२ बहत्तर +७३ तिहत्तर +७४ चौहत्तर +७५ पचहत्तर +७६ छिहत्तर +७७ सतहत्तर +७८ अठहत्तर +७९ उनासी +८० अस्सी +८१ इक्यासी +८२ बयासी +८३ तिरासी +८४ चौरासी +८५ पचासी +८६ छियासी +८७ सत्तासी +८८ अट्ठासी +८९ नवासी +९० नब्बे +९१ इक्यानबे +९२ बानबे +९३ तिरानबे +९४ चौरानबे +९५ पंचानबे +९६ छियानबे +९७ सत्तानबे +९८ अट्ठानबे +९९ निन्यानबे diff --git 
a/nemo_text_processing/text_normalization/hi/data/numbers/thousands.tsv b/nemo_text_processing/text_normalization/hi/data/numbers/thousands.tsv new file mode 100644 index 000000000..ab9889218 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/numbers/thousands.tsv @@ -0,0 +1,8 @@ +१००० हज़ार +१००००० लाख +१००००००० करोड़ +१००००००००० अरब +१००००००००००० खरब +१००००००००००००० नील +१००००००००००००००० पद्म +१००००००००००००००००० शंख diff --git a/nemo_text_processing/text_normalization/hi/data/numbers/zero.tsv b/nemo_text_processing/text_normalization/hi/data/numbers/zero.tsv new file mode 100644 index 000000000..0735899fe --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/numbers/zero.tsv @@ -0,0 +1 @@ +० शून्य diff --git a/nemo_text_processing/text_normalization/hi/data/time/__init__.py b/nemo_text_processing/text_normalization/hi/data/time/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/hi/data/time/hours.tsv b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv new file mode 100644 index 000000000..d5e85a784 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv @@ -0,0 +1,24 @@ +१ एक +२ दो +३ तीन +४ चार +५ पाँच +६ छह +७ सात +८ आठ +९ नौ +१० दस +११ ग्यारह +१२ बारह +१३ तेरह +१४ चौदह +१५ पंद्रह +१६ सोलह +१७ सत्रह +१८ अठारह +१९ उन्नीस +२० बीस +२१ इक्कीस +२२ बाईस +२३ तेईस +२४ चौबीस diff --git a/nemo_text_processing/text_normalization/hi/data/time/minutes.tsv b/nemo_text_processing/text_normalization/hi/data/time/minutes.tsv new file mode 100644 index 000000000..6689d8070 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/time/minutes.tsv @@ -0,0 +1,60 @@ +०१ एक +०२ दो +०३ तीन +०४ चार +०५ पाँच +०६ छह +०७ सात +०८ आठ +०९ नौ +१० दस +११ ग्यारह +१२ बारह +१३ तेरह +१४ चौदह +१५ पंद्रह +१६ सोलह +१७ सत्रह +१८ अठारह +१९ उन्नीस +२० बीस +२१ इक्कीस +२२ बाईस +२३ तेईस +२४ चौबीस +२५ पच्चीस +२६ छब्बीस +२७ सत्ताईस +२८ अट्ठाईस +२९ उनतीस +३० तीस +३१ इकतीस +३२ बत्तीस +३३ तैंतीस +३४ चौंतीस +३५ पैंतीस +३६ छत्तीस +३७ सैंतीस +३८ अड़तीस +३९ उनतालीस +४० चालीस +४१ इकतालीस +४२ बयालीस +४३ तैंतालीस +४४ चौवालीस +४५ पैंतालीस +४६ छियालीस +४७ सैंतालीस +४८ अड़तालीस +४९ उनचास +५० पचास +५१ इक्यावन +५२ बावन +५३ तिरेपन +५४ चौवन +५५ पचपन +५६ छप्पन +५७ सत्तावन +५८ अट्ठावन +५९ उनसठ +६० साठ diff --git a/nemo_text_processing/text_normalization/hi/data/time/seconds.tsv b/nemo_text_processing/text_normalization/hi/data/time/seconds.tsv new file mode 100644 index 000000000..6689d8070 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/time/seconds.tsv @@ -0,0 +1,60 @@ +०१ एक +०२ दो +०३ तीन +०४ चार +०५ पाँच +०६ छह +०७ सात +०८ आठ +०९ नौ +१० दस +११ ग्यारह +१२ बारह +१३ तेरह +१४ चौदह +१५ पंद्रह +१६ सोलह +१७ सत्रह +१८ अठारह +१९ उन्नीस +२० बीस +२१ इक्कीस +२२ बाईस +२३ तेईस +२४ चौबीस +२५ पच्चीस +२६ छब्बीस +२७ सत्ताईस +२८ अट्ठाईस +२९ उनतीस +३० तीस +३१ इकतीस +३२ बत्तीस +३३ तैंतीस +३४ 
चौंतीस +३५ पैंतीस +३६ छत्तीस +३७ सैंतीस +३८ अड़तीस +३९ उनतालीस +४० चालीस +४१ इकतालीस +४२ बयालीस +४३ तैंतालीस +४४ चौवालीस +४५ पैंतालीस +४६ छियालीस +४७ सैंतालीस +४८ अड़तालीस +४९ उनचास +५० पचास +५१ इक्यावन +५२ बावन +५३ तिरेपन +५४ चौवन +५५ पचपन +५६ छप्पन +५७ सत्तावन +५८ अट्ठावन +५९ उनसठ +६० साठ diff --git a/nemo_text_processing/text_normalization/hi/data/whitelist/__init__.py b/nemo_text_processing/text_normalization/hi/data/whitelist/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/whitelist/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hi/data/whitelist/abbreviations.tsv b/nemo_text_processing/text_normalization/hi/data/whitelist/abbreviations.tsv new file mode 100644 index 000000000..fb39da2a4 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/whitelist/abbreviations.tsv @@ -0,0 +1,7 @@ +डॉ. डॉक्टर +प्रो. प्रोफेसर +इं. इंजीनियर +ले. लेफ्टिनेंट +वै. वैज्ञानिक +कु. कुमारी +मा. मास्टर \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/graph_utils.py b/nemo_text_processing/text_normalization/hi/graph_utils.py new file mode 100644 index 000000000..ced1b8949 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/graph_utils.py @@ -0,0 +1,182 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +NEMO_CHAR = utf8.VALID_UTF8_CHAR +NEMO_DIGIT = byte.DIGIT + +NEMO_HI_DIGIT = pynini.union("०", "१", "२", "३", "४", "५", "६", "७", "८", "९").optimize() +NEMO_HI_NON_ZERO = pynini.union("१", "२", "३", "४", "५", "६", "७", "८", "९").optimize() +NEMO_HI_ZERO = "०" +NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() +NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() +NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_NON_BREAKING_SPACE = u"\u00A0" +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() +TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]) +TO_UPPER = pynini.invert(TO_LOWER) +NEMO_SIGMA = pynini.closure(NEMO_CHAR) + + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") 
+delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"")) +) + + +MIN_NEG_WEIGHT = -0.0001 +MIN_POS_WEIGHT = 0.0001 +INPUT_CASED = "cased" +INPUT_LOWER_CASED = "lower_cased" +MINUS = pynini.union(" ऋणात्मक ", " ऋणात्मक ").optimize() + + +def capitalized_input_graph( + graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None +) -> 'pynini.FstLike': + """ + Allow graph input to be capitalized, e.g. for ITN) + + Args: + graph: FstGraph + original_graph_weight: weight to add to the original `graph` + capitalized_graph_weight: weight to add to the capitalized graph + """ + capitalized_graph = pynini.compose(TO_LOWER + NEMO_SIGMA, graph).optimize() + + if original_graph_weight is not None: + graph = pynutil.add_weight(graph, weight=original_graph_weight) + + if capitalized_graph_weight is not None: + capitalized_graph = pynutil.add_weight(capitalized_graph, weight=capitalized_graph_weight) + + graph |= capitalized_graph + return graph + + +def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): + """ + Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. + + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logging.info(f'Created {file_name}') + + +def convert_space(fst) -> 'pynini.FstLike': + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. 
+ + Args: + fst: input fst + + Returns output fst where breaking spaces are converted to non breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + + +class GraphFst: + """ + Base class for all grammar fsts. + + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far') + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> 'pynini.FstLike': + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> 'pynini.FstLike': + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> 'pynini.FstLike': + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/hi/taggers/__init__.py b/nemo_text_processing/text_normalization/hi/taggers/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ 
b/nemo_text_processing/text_normalization/hi/taggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py new file mode 100644 index 000000000..fe3ad9a1d --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -0,0 +1,322 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst +from nemo_text_processing.text_normalization.hi.utils import get_abs_path + + +class CardinalFst(GraphFst): + """ + Finite state transducer for classifying cardinals, e.g. 
+ -२३ -> cardinal { negative: "true" integer: "तेईस" } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True, lm: bool = False): + super().__init__(name="cardinal", kind="classify", deterministic=deterministic) + + digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) + teens_and_ties = pynutil.add_weight(teens_ties, -0.1) + + def create_graph_suffix(digit_graph, suffix, zeros_counts): + zero = pynutil.add_weight(pynutil.delete("०"), -0.1) + if zeros_counts == 0: + return digit_graph + suffix + + return digit_graph + (zero ** zeros_counts) + suffix + + def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): + insert_space = pynutil.insert(" ") + zero = pynutil.add_weight(pynutil.delete("०"), -0.1) + if zeros_counts == 0: + return digit_graph + suffix + insert_space + sub_graph + + return digit_graph + suffix + (zero ** zeros_counts) + insert_space + sub_graph + + # Hundred graph + suffix_hundreds = pynutil.insert(" सौ") + graph_hundreds = create_graph_suffix(digit, suffix_hundreds, 2) + graph_hundreds |= create_larger_number_graph(digit, suffix_hundreds, 1, digit) + graph_hundreds |= create_larger_number_graph(digit, suffix_hundreds, 0, teens_ties) + graph_hundreds.optimize() + self.graph_hundreds = graph_hundreds + + # Transducer for eleven hundred -> 1100 or twenty one hundred eleven -> 2111 + graph_hundreds_as_thousand = create_graph_suffix(teens_and_ties, suffix_hundreds, 2) + graph_hundreds_as_thousand |= create_larger_number_graph(teens_and_ties, suffix_hundreds, 1, digit) + graph_hundreds_as_thousand |= create_larger_number_graph(teens_and_ties, suffix_hundreds, 0, teens_ties) + self.graph_hundreds_as_thousand 
= graph_hundreds_as_thousand + + # Thousands and Ten thousands graph + suffix_thousands = pynutil.insert(" हज़ार") + graph_thousands = create_graph_suffix(digit, suffix_thousands, 3) + graph_thousands |= create_larger_number_graph(digit, suffix_thousands, 2, digit) + graph_thousands |= create_larger_number_graph(digit, suffix_thousands, 1, teens_ties) + graph_thousands |= create_larger_number_graph(digit, suffix_thousands, 0, graph_hundreds) + graph_thousands.optimize() + self.graph_thousands = graph_thousands + + graph_ten_thousands = create_graph_suffix(teens_and_ties, suffix_thousands, 3) + graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 2, digit) + graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 1, teens_ties) + graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 0, graph_hundreds) + graph_ten_thousands.optimize() + + # Lakhs graph and ten lakhs graph + suffix_lakhs = pynutil.insert(" लाख") + graph_lakhs = create_graph_suffix(digit, suffix_lakhs, 5) + graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 4, digit) + graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 3, teens_ties) + graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 2, graph_hundreds) + graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 1, graph_thousands) + graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 0, graph_ten_thousands) + graph_lakhs.optimize() + + graph_ten_lakhs = create_graph_suffix(teens_and_ties, suffix_lakhs, 5) + graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 4, digit) + graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 3, teens_ties) + graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 2, graph_hundreds) + graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 1, graph_thousands) + graph_ten_lakhs |= 
create_larger_number_graph(teens_and_ties, suffix_lakhs, 0, graph_ten_thousands) + graph_ten_lakhs.optimize() + + # Crores graph ten crores graph + suffix_crores = pynutil.insert(" करोड़") + graph_crores = create_graph_suffix(digit, suffix_crores, 7) + graph_crores |= create_larger_number_graph(digit, suffix_crores, 6, digit) + graph_crores |= create_larger_number_graph(digit, suffix_crores, 5, teens_ties) + graph_crores |= create_larger_number_graph(digit, suffix_crores, 4, graph_hundreds) + graph_crores |= create_larger_number_graph(digit, suffix_crores, 3, graph_thousands) + graph_crores |= create_larger_number_graph(digit, suffix_crores, 2, graph_ten_thousands) + graph_crores |= create_larger_number_graph(digit, suffix_crores, 1, graph_lakhs) + graph_crores |= create_larger_number_graph(digit, suffix_crores, 0, graph_ten_lakhs) + graph_crores.optimize() + + graph_ten_crores = create_graph_suffix(teens_and_ties, suffix_crores, 7) + graph_ten_crores |= create_larger_number_graph(teens_and_ties, suffix_crores, 6, digit) + graph_ten_crores |= create_larger_number_graph(teens_and_ties, suffix_crores, 5, teens_ties) + graph_ten_crores |= create_larger_number_graph(teens_and_ties, suffix_crores, 4, graph_hundreds) + graph_ten_crores |= create_larger_number_graph(teens_and_ties, suffix_crores, 3, graph_thousands) + graph_ten_crores |= create_larger_number_graph(teens_and_ties, suffix_crores, 2, graph_ten_thousands) + graph_ten_crores |= create_larger_number_graph(teens_and_ties, suffix_crores, 1, graph_lakhs) + graph_ten_crores |= create_larger_number_graph(teens_and_ties, suffix_crores, 0, graph_ten_lakhs) + graph_ten_crores.optimize() + + # Arabs graph and ten arabs graph + suffix_arabs = pynutil.insert(" अरब") + graph_arabs = create_graph_suffix(digit, suffix_arabs, 9) + graph_arabs |= create_larger_number_graph(digit, suffix_arabs, 8, digit) + graph_arabs |= create_larger_number_graph(digit, suffix_arabs, 7, teens_ties) + graph_arabs |= 
create_larger_number_graph(digit, suffix_arabs, 6, graph_hundreds) + graph_arabs |= create_larger_number_graph(digit, suffix_arabs, 5, graph_thousands) + graph_arabs |= create_larger_number_graph(digit, suffix_arabs, 4, graph_ten_thousands) + graph_arabs |= create_larger_number_graph(digit, suffix_arabs, 3, graph_lakhs) + graph_arabs |= create_larger_number_graph(digit, suffix_arabs, 2, graph_ten_lakhs) + graph_arabs |= create_larger_number_graph(digit, suffix_arabs, 1, graph_crores) + graph_arabs |= create_larger_number_graph(digit, suffix_arabs, 0, graph_ten_crores) + graph_arabs.optimize() + + graph_ten_arabs = create_graph_suffix(teens_and_ties, suffix_arabs, 9) + graph_ten_arabs |= create_larger_number_graph(teens_and_ties, suffix_arabs, 8, digit) + graph_ten_arabs |= create_larger_number_graph(teens_and_ties, suffix_arabs, 7, teens_ties) + graph_ten_arabs |= create_larger_number_graph(teens_and_ties, suffix_arabs, 6, graph_hundreds) + graph_ten_arabs |= create_larger_number_graph(teens_and_ties, suffix_arabs, 5, graph_thousands) + graph_ten_arabs |= create_larger_number_graph(teens_and_ties, suffix_arabs, 4, graph_ten_thousands) + graph_ten_arabs |= create_larger_number_graph(teens_and_ties, suffix_arabs, 3, graph_lakhs) + graph_ten_arabs |= create_larger_number_graph(teens_and_ties, suffix_arabs, 2, graph_ten_lakhs) + graph_ten_arabs |= create_larger_number_graph(teens_and_ties, suffix_arabs, 1, graph_crores) + graph_ten_arabs |= create_larger_number_graph(teens_and_ties, suffix_arabs, 0, graph_ten_crores) + graph_ten_arabs.optimize() + + # Kharabs graph and ten kharabs graph + suffix_kharabs = pynutil.insert(" खरब") + graph_kharabs = create_graph_suffix(digit, suffix_kharabs, 11) + graph_kharabs |= create_larger_number_graph(digit, suffix_kharabs, 10, digit) + graph_kharabs |= create_larger_number_graph(digit, suffix_kharabs, 9, teens_ties) + graph_kharabs |= create_larger_number_graph(digit, suffix_kharabs, 8, graph_hundreds) + graph_kharabs |= 
create_larger_number_graph(digit, suffix_kharabs, 7, graph_thousands) + graph_kharabs |= create_larger_number_graph(digit, suffix_kharabs, 6, graph_ten_thousands) + graph_kharabs |= create_larger_number_graph(digit, suffix_kharabs, 5, graph_lakhs) + graph_kharabs |= create_larger_number_graph(digit, suffix_kharabs, 4, graph_ten_lakhs) + graph_kharabs |= create_larger_number_graph(digit, suffix_kharabs, 3, graph_crores) + graph_kharabs |= create_larger_number_graph(digit, suffix_kharabs, 2, graph_ten_crores) + graph_kharabs |= create_larger_number_graph(digit, suffix_kharabs, 1, graph_arabs) + graph_kharabs |= create_larger_number_graph(digit, suffix_kharabs, 0, graph_ten_arabs) + graph_kharabs.optimize() + + graph_ten_kharabs = create_graph_suffix(teens_and_ties, suffix_kharabs, 11) + graph_ten_kharabs |= create_larger_number_graph(teens_and_ties, suffix_kharabs, 10, digit) + graph_ten_kharabs |= create_larger_number_graph(teens_and_ties, suffix_kharabs, 9, teens_ties) + graph_ten_kharabs |= create_larger_number_graph(teens_and_ties, suffix_kharabs, 8, graph_hundreds) + graph_ten_kharabs |= create_larger_number_graph(teens_and_ties, suffix_kharabs, 7, graph_thousands) + graph_ten_kharabs |= create_larger_number_graph(teens_and_ties, suffix_kharabs, 6, graph_ten_thousands) + graph_ten_kharabs |= create_larger_number_graph(teens_and_ties, suffix_kharabs, 5, graph_lakhs) + graph_ten_kharabs |= create_larger_number_graph(teens_and_ties, suffix_kharabs, 4, graph_ten_lakhs) + graph_ten_kharabs |= create_larger_number_graph(teens_and_ties, suffix_kharabs, 3, graph_crores) + graph_ten_kharabs |= create_larger_number_graph(teens_and_ties, suffix_kharabs, 2, graph_ten_crores) + graph_ten_kharabs |= create_larger_number_graph(teens_and_ties, suffix_kharabs, 1, graph_arabs) + graph_ten_kharabs |= create_larger_number_graph(teens_and_ties, suffix_kharabs, 0, graph_ten_arabs) + graph_ten_kharabs.optimize() + + # Nils graph and ten nils graph + suffix_nils = pynutil.insert(" 
नील") + graph_nils = create_graph_suffix(digit, suffix_nils, 13) + graph_nils |= create_larger_number_graph(digit, suffix_nils, 12, digit) + graph_nils |= create_larger_number_graph(digit, suffix_nils, 11, teens_ties) + graph_nils |= create_larger_number_graph(digit, suffix_nils, 10, graph_hundreds) + graph_nils |= create_larger_number_graph(digit, suffix_nils, 9, graph_thousands) + graph_nils |= create_larger_number_graph(digit, suffix_nils, 8, graph_ten_thousands) + graph_nils |= create_larger_number_graph(digit, suffix_nils, 7, graph_lakhs) + graph_nils |= create_larger_number_graph(digit, suffix_nils, 6, graph_ten_lakhs) + graph_nils |= create_larger_number_graph(digit, suffix_nils, 5, graph_crores) + graph_nils |= create_larger_number_graph(digit, suffix_nils, 4, graph_ten_crores) + graph_nils |= create_larger_number_graph(digit, suffix_nils, 3, graph_arabs) + graph_nils |= create_larger_number_graph(digit, suffix_nils, 2, graph_ten_arabs) + graph_nils |= create_larger_number_graph(digit, suffix_nils, 1, graph_kharabs) + graph_nils |= create_larger_number_graph(digit, suffix_nils, 0, graph_ten_kharabs) + graph_nils.optimize() + + graph_ten_nils = create_graph_suffix(teens_and_ties, suffix_nils, 13) + graph_ten_nils |= create_larger_number_graph(teens_and_ties, suffix_nils, 12, digit) + graph_ten_nils |= create_larger_number_graph(teens_and_ties, suffix_nils, 11, teens_ties) + graph_ten_nils |= create_larger_number_graph(teens_and_ties, suffix_nils, 10, graph_hundreds) + graph_ten_nils |= create_larger_number_graph(teens_and_ties, suffix_nils, 9, graph_thousands) + graph_ten_nils |= create_larger_number_graph(teens_and_ties, suffix_nils, 8, graph_ten_thousands) + graph_ten_nils |= create_larger_number_graph(teens_and_ties, suffix_nils, 7, graph_lakhs) + graph_ten_nils |= create_larger_number_graph(teens_and_ties, suffix_nils, 6, graph_ten_lakhs) + graph_ten_nils |= create_larger_number_graph(teens_and_ties, suffix_nils, 5, graph_crores) + graph_ten_nils |= 
create_larger_number_graph(teens_and_ties, suffix_nils, 4, graph_ten_crores) + graph_ten_nils |= create_larger_number_graph(teens_and_ties, suffix_nils, 3, graph_arabs) + graph_ten_nils |= create_larger_number_graph(teens_and_ties, suffix_nils, 2, graph_ten_arabs) + graph_ten_nils |= create_larger_number_graph(teens_and_ties, suffix_nils, 1, graph_kharabs) + graph_ten_nils |= create_larger_number_graph(teens_and_ties, suffix_nils, 0, graph_ten_kharabs) + graph_ten_nils.optimize() + + # Padmas graph and ten padmas graph + suffix_padmas = pynutil.insert(" पद्म") + graph_padmas = create_graph_suffix(digit, suffix_padmas, 15) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 14, digit) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 13, teens_ties) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 12, graph_hundreds) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 11, graph_thousands) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 10, graph_ten_thousands) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 9, graph_lakhs) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 8, graph_ten_lakhs) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 7, graph_crores) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 6, graph_ten_crores) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 5, graph_arabs) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 4, graph_ten_arabs) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 3, graph_kharabs) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 2, graph_ten_kharabs) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 1, graph_nils) + graph_padmas |= create_larger_number_graph(digit, suffix_padmas, 0, graph_ten_nils) + graph_padmas.optimize() + + graph_ten_padmas = create_graph_suffix(teens_and_ties, 
suffix_padmas, 15) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 14, digit) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 13, teens_ties) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 12, graph_hundreds) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 11, graph_thousands) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 10, graph_ten_thousands) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 9, graph_lakhs) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 8, graph_ten_lakhs) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 7, graph_crores) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 6, graph_ten_crores) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 5, graph_arabs) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 4, graph_ten_arabs) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 3, graph_kharabs) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 2, graph_ten_kharabs) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 1, graph_nils) + graph_ten_padmas |= create_larger_number_graph(teens_and_ties, suffix_padmas, 0, graph_ten_nils) + graph_ten_padmas.optimize() + + # Shankhs graph and ten shankhs graph + suffix_shankhs = pynutil.insert(" शंख") + graph_shankhs = create_graph_suffix(digit, suffix_shankhs, 17) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 16, digit) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 15, teens_ties) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 14, graph_hundreds) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 
13, graph_thousands) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 12, graph_ten_thousands) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 11, graph_lakhs) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 10, graph_ten_lakhs) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 9, graph_crores) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 8, graph_ten_crores) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 7, graph_arabs) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 6, graph_ten_arabs) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 5, graph_kharabs) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 4, graph_ten_kharabs) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 3, graph_nils) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 2, graph_ten_nils) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 1, graph_padmas) + graph_shankhs |= create_larger_number_graph(digit, suffix_shankhs, 0, graph_ten_padmas) + graph_shankhs.optimize() + + graph_ten_shankhs = create_graph_suffix(teens_and_ties, suffix_shankhs, 17) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 16, digit) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 15, teens_ties) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 14, graph_hundreds) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 13, graph_thousands) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 12, graph_ten_thousands) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 11, graph_lakhs) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 10, graph_ten_lakhs) + graph_ten_shankhs 
|= create_larger_number_graph(teens_and_ties, suffix_shankhs, 9, graph_crores) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 8, graph_ten_crores) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 7, graph_arabs) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 6, graph_ten_arabs) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 5, graph_kharabs) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 4, graph_ten_kharabs) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 3, graph_nils) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 2, graph_ten_nils) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 1, graph_padmas) + graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 0, graph_ten_padmas) + graph_ten_shankhs.optimize() + + final_graph = ( + digit + | zero + | teens_and_ties + | graph_hundreds + | graph_thousands + | graph_ten_thousands + | graph_lakhs + | graph_ten_lakhs + | graph_crores + | graph_ten_crores + | graph_arabs + | graph_ten_arabs + | graph_kharabs + | graph_ten_kharabs + | graph_nils + | graph_ten_nils + | graph_padmas + | graph_ten_padmas + | graph_shankhs + | graph_ten_shankhs + ) + + optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) + + self.final_graph = final_graph.optimize() + final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.final_graph + pynutil.insert("\"") + final_graph = self.add_tokens(final_graph) + self.fst = final_graph diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py new file mode 100644 index 000000000..19aaf3139 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py 
@@ -0,0 +1,95 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import ( + NEMO_HI_DIGIT, + NEMO_HI_NON_ZERO, + NEMO_HI_ZERO, + GraphFst, + insert_space, +) +from nemo_text_processing.text_normalization.hi.utils import get_abs_path + +days = pynini.string_file(get_abs_path("data/date/days.tsv")) +months = pynini.string_file(get_abs_path("data/date/months.tsv")) + + +class DateFst(GraphFst): + """ + Finite state transducer for classifying date, e.g. 
+ "०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } + "०४-०१-२०२४" -> date { month: "अप्रैल" day: "एक" year: "दो हज़ार चौबीस" } + + + Args: + cardinal: cardinal GraphFst + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="date", kind="classify") + + graph_year_thousands = pynini.compose( + (NEMO_HI_DIGIT + NEMO_HI_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_thousands + ) + graph_year_hundreds_as_thousands = pynini.compose( + (NEMO_HI_DIGIT + NEMO_HI_NON_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_hundreds_as_thousand + ) + + graph_year = graph_year_thousands | graph_year_hundreds_as_thousands + + delete_dash = pynutil.delete("-") + delete_slash = pynutil.delete("/") + + days_graph = pynutil.insert("day: \"") + days + pynutil.insert("\"") + insert_space + + months_graph = pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space + + years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space + + graph_dd_mm = days_graph + delete_dash + months_graph + + graph_mm_dd = months_graph + delete_dash + days_graph + + graph_mm_dd += pynutil.insert(" preserve_order: true ") + + graph_dd_mm_yyyy = ( + days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph + ) + + graph_mm_dd_yyyy = ( + months_graph + (delete_dash | delete_slash) + days_graph + (delete_dash | delete_slash) + years_graph + ) + + graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") + + graph_mm_yyyy = months_graph + delete_dash + years_graph + + # default assume dd_mm_yyyy + + final_graph = ( + pynutil.add_weight(graph_dd_mm, -0.001) + | graph_mm_dd + | pynutil.add_weight(graph_dd_mm_yyyy, -0.001) + | graph_mm_dd_yyyy + | graph_mm_yyyy + ) + + self.final_graph = final_graph.optimize() + + 
self.fst = self.add_tokens(self.final_graph) diff --git a/nemo_text_processing/text_normalization/hi/taggers/decimal.py b/nemo_text_processing/text_normalization/hi/taggers/decimal.py new file mode 100644 index 000000000..d0bef9373 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/decimal.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space +from nemo_text_processing.text_normalization.hi.utils import get_abs_path + +quantities = pynini.string_file(get_abs_path("data/numbers/thousands.tsv")) + + +def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike') -> 'pynini.FstLike': + + """ + Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, + e.g. १ लाख -> integer_part: "एक" quantity: "लाख" + e.g. 
१.५ लाख -> integer_part: "एक" fractional_part: "पाँच" quantity: "लाख" + + Args: + decimal: decimal FST + cardinal_up_to_hundred: cardinal FST + """ + numbers = cardinal_up_to_hundred + + res = ( + pynutil.insert("integer_part: \"") + + numbers + + pynutil.insert("\"") + + insert_space + + pynutil.insert("quantity: \"") + + quantities + + pynutil.insert("\"") + ) + res |= decimal + insert_space + pynutil.insert("quantity: \"") + quantities + pynutil.insert("\"") + return res + + +class DecimalFst(GraphFst): + """ + Finite state transducer for classifying decimal, e.g. + -१२.५००६ अरब -> decimal { negative: "true" integer_part: "बारह" fractional_part: "पाँच शून्य शून्य छह" quantity: "अरब" } + १ अरब -> decimal { integer_part: "एक" quantity: "अरब" } + + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="decimal", kind="classify", deterministic=deterministic) + + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + graph_digit |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + + cardinal_graph = cardinal.final_graph + + self.graph = graph_digit + pynini.closure(insert_space + graph_digit).optimize() + + point = pynutil.delete(".") + + optional_graph_negative = pynini.closure( + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, + ) + + self.graph_fractional = pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"") + self.graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + + final_graph_wo_sign = self.graph_integer + point + insert_space + self.graph_fractional + + self.final_graph_wo_negative = final_graph_wo_sign | get_quantity(final_graph_wo_sign, cardinal_graph) + + final_graph = optional_graph_negative + self.final_graph_wo_negative + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git 
a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py new file mode 100644 index 000000000..a29a72666 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst + + +class FractionFst(GraphFst): + """ + Finite state transducer for classifying fraction + "२३ ४/६" -> + fraction { integer: "तेईस" numerator: "चार" denominator: "छः"} + ४/६" -> + fraction { numerator: "चार" denominator: "छः"} + + + Args: + cardinal: cardinal GraphFst + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, cardinal, deterministic: bool = True): + super().__init__(name="fraction", kind="classify", deterministic=deterministic) + + cardinal_graph = cardinal.final_graph + + self.optional_graph_negative = pynini.closure( + pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1 + ) + self.integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + self.numerator = ( + pynutil.insert("numerator: \"") + cardinal_graph + pynini.cross(pynini.union("/", " / "), "\" ") + ) + 
self.denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"") + + self.graph = ( + self.optional_graph_negative + + pynini.closure(self.integer + pynini.accep(" "), 0, 1) + + self.numerator + + self.denominator + ) + + graph = self.graph + final_graph = self.add_tokens(graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py new file mode 100644 index 000000000..7434fd70f --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -0,0 +1,72 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.hi.utils import get_abs_path + + +class MeasureFst(GraphFst): + """ + Finite state transducer for classifying measure, suppletive aware, e.g. 
+ -१२kg -> measure { negative: "true" cardinal { integer: "बारह" } units: "किलोग्राम" } + -१२.२kg -> measure { decimal { negative: "true" integer_part: "बारह" fractional_part: "दो"} units: "किलोग्राम" } + + Args: + cardinal: CardinalFst + decimal: DecimalFst + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, cardinal: GraphFst, decimal: GraphFst): + super().__init__(name="measure", kind="classify") + + cardinal_graph = cardinal.final_graph + decimal_graph = decimal.final_graph_wo_negative + unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) + + optional_graph_negative = pynini.closure( + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, + ) + + # Define the unit handling + self.unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ") + + graph_measurements = ( + pynutil.insert("decimal { ") + + optional_graph_negative + + decimal_graph + + pynutil.insert(" }") + + delete_space + + self.unit + ) + graph_measurements |= ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + delete_space + + self.unit + ) + + graph = graph_measurements + self.graph = graph.optimize() + + final_graph = self.add_tokens(graph) + self.fst = final_graph diff --git a/nemo_text_processing/text_normalization/hi/taggers/money.py b/nemo_text_processing/text_normalization/hi/taggers/money.py new file mode 100644 index 000000000..c44d6d346 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/money.py @@ -0,0 +1,62 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space +from nemo_text_processing.text_normalization.hi.utils import get_abs_path + +currency_graph = pynini.string_file(get_abs_path("data/money/currency.tsv")) + + +class MoneyFst(GraphFst): + """ + Finite state transducer for classifying money, suppletive aware, e.g. + ₹1 -> money { currency: "रुपए" integer_part: "एक" } + ₹1.2 -> money { currency: "रुपए" integer_part: "एक" fractional_part: "दो" } + + Args: + cardinal: CardinalFst + decimal: DecimalFst + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, cardinal: GraphFst, decimal: GraphFst): + super().__init__(name="money", kind="classify") + + cardinal_graph = cardinal.final_graph + + optional_graph_negative = pynini.closure( + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, + ) + self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ") + self.interger = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\" ") + self.fraction = pynutil.insert("fractional_part: \"") + cardinal_graph + pynutil.insert("\" ") + + graph_currencies = optional_graph_negative + self.currency + insert_space + self.interger + graph_currencies |= ( + optional_graph_negative + + self.currency + + insert_space + + self.interger + + pynutil.delete(".") + + insert_space + + self.fraction + ) 
+ graph = graph_currencies + self.graph = graph.optimize() + final_graph = self.add_tokens(graph) + self.fst = final_graph diff --git a/nemo_text_processing/text_normalization/hi/taggers/punctuation.py b/nemo_text_processing/text_normalization/hi/taggers/punctuation.py new file mode 100644 index 000000000..8309ba030 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/punctuation.py @@ -0,0 +1,48 @@ +import sys +from unicodedata import category + +import pynini +from pynini.examples import plurals +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_SPACE, NEMO_SIGMA, GraphFst + + +class PunctuationFst(GraphFst): + """ + Finite state transducer for classifying punctuation + e.g. a, -> tokens { name: "a" } tokens { name: "," } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transductions are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="punctuation", kind="classify", deterministic=deterministic) + s = "!#%&\'()*+,-./:;<=>?@^_`{|}~\"" + + punct_symbols_to_exclude = ["[", "]"] + punct_unicode = [ + chr(i) + for i in range(sys.maxunicode) + if category(chr(i)).startswith("P") and chr(i) not in punct_symbols_to_exclude + ] + + self.punct_marks = [p for p in punct_unicode + list(s)] + + punct = pynini.union(*self.punct_marks) + punct = pynini.closure(punct, 1) + + emphasis = ( + pynini.accep("<") + + ( + (pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1) + pynini.closure(pynini.accep("/"), 0, 1)) + | (pynini.accep("/") + pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)) + ) + + pynini.accep(">") + ) + punct = plurals._priority_union(emphasis, punct, NEMO_SIGMA) + + self.graph = punct + self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/time.py 
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space
from nemo_text_processing.text_normalization.hi.utils import get_abs_path

hours_graph = pynini.string_file(get_abs_path("data/time/hours.tsv"))
minutes_graph = pynini.string_file(get_abs_path("data/time/minutes.tsv"))
seconds_graph = pynini.string_file(get_abs_path("data/time/seconds.tsv"))


class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time, e.g.
        १२:३०:३० -> time { hours: "बारह" minutes: "तीस" seconds: "तीस" }
        १:४० -> time { hours: "एक" minutes: "चालीस" }
        १:०० -> time { hours: "एक" }
    """

    def __init__(self):
        super().__init__(name="time", kind="classify")

        colon = pynutil.delete(":")

        # Each component is looked up in its own TSV and wrapped in its field.
        self.hours = pynutil.insert("hours: \"") + hours_graph + pynutil.insert("\" ")
        self.minutes = pynutil.insert("minutes: \"") + minutes_graph + pynutil.insert("\" ")
        self.seconds = pynutil.insert("seconds: \"") + seconds_graph + pynutil.insert("\" ")

        # HH:MM:SS
        hms = self.hours + colon + insert_space + self.minutes + colon + insert_space + self.seconds
        # HH:MM
        hm = self.hours + colon + insert_space + self.minutes
        # HH:०० — an exact hour drops the zero minutes entirely.
        h_only = self.hours + colon + pynutil.delete("००")

        final_graph = self.add_tokens(hms | hm | h_only)
        self.fst = final_graph.optimize()
import logging
import os
import time

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import (
    NEMO_WHITE_SPACE,
    GraphFst,
    delete_extra_space,
    delete_space,
    generator_main,
)
from nemo_text_processing.text_normalization.hi.taggers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.hi.taggers.date import DateFst
from nemo_text_processing.text_normalization.hi.taggers.decimal import DecimalFst
from nemo_text_processing.text_normalization.hi.taggers.fraction import FractionFst
from nemo_text_processing.text_normalization.hi.taggers.measure import MeasureFst
from nemo_text_processing.text_normalization.hi.taggers.money import MoneyFst
from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.hi.taggers.time import TimeFst
from nemo_text_processing.text_normalization.hi.taggers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.hi.taggers.word import WordFst


class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        input_case: accepting either "lower_cased" or "cased" input.
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
        whitelist: path to a file with whitelist replacements
    """

    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

        # Try to restore a previously compiled grammar from the FAR cache; the
        # cache key encodes determinism, input case and the whitelist file name.
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(f"Creating ClassifyFst grammars.")

            # Build each semiotic-class tagger, logging build time and FST size.
            start_time = time.time()
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst
            logging.debug(f"cardinal: {time.time() - start_time: .2f}s -- {cardinal_graph.num_states()} nodes")

            start_time = time.time()
            decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
            decimal_graph = decimal.fst
            logging.debug(f"decimal: {time.time() - start_time: .2f}s -- {decimal_graph.num_states()} nodes")

            start_time = time.time()
            fraction = FractionFst(cardinal=cardinal, deterministic=deterministic)
            fraction_graph = fraction.fst
            logging.debug(f"fraction: {time.time() - start_time: .2f}s -- {fraction_graph.num_states()} nodes")

            start_time = time.time()
            date = DateFst(cardinal=cardinal)
            date_graph = date.fst
            logging.debug(f"date: {time.time() - start_time: .2f}s -- {date_graph.num_states()} nodes")

            start_time = time.time()
            timefst = TimeFst()
            time_graph = timefst.fst
            logging.debug(f"time: {time.time() - start_time: .2f}s -- {time_graph.num_states()} nodes")

            start_time = time.time()
            measure = MeasureFst(cardinal=cardinal, decimal=decimal)
            measure_graph = measure.fst
            logging.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes")

            start_time = time.time()
            money = MoneyFst(cardinal=cardinal, decimal=decimal)
            money_graph = money.fst
            logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes")

            start_time = time.time()
            whitelist_graph = WhiteListFst(
                input_case=input_case, deterministic=deterministic, input_file=whitelist
            ).fst
            logging.debug(f"whitelist: {time.time() - start_time: .2f}s -- {whitelist_graph.num_states()} nodes")

            start_time = time.time()
            punctuation = PunctuationFst(deterministic=deterministic)
            punct_graph = punctuation.fst
            logging.debug(f"punct: {time.time() - start_time: .2f}s -- {punct_graph.num_states()} nodes")

            # Union of all taggers. Lower weight wins on ambiguity: the whitelist
            # (1.01) is preferred over all other semiotic classes (1.1).
            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(decimal_graph, 1.1)
                | pynutil.add_weight(fraction_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.1)
                | pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(money_graph, 1.1)
            )

            start_time = time.time()
            word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).fst
            logging.debug(f"word: {time.time() - start_time: .2f}s -- {word_graph.num_states()} nodes")

            # Punctuation becomes its own token; weight 2.1 keeps it below the
            # semiotic classes so punctuation attached to a match is not stolen.
            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                | (pynutil.insert(" ") + punct),
                1,
            )

            # Plain words are the fallback with a very large weight (100), so any
            # semiotic-class parse is always preferred over verbatim word output.
            classify |= pynutil.add_weight(word_graph, 100)
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
            )

            # A sentence is a sequence of tokens separated by collapsed whitespace
            # and/or punctuation tokens.
            graph = token_plus_punct + pynini.closure(
                (
                    pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                    | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                )
                + token_plus_punct
            )

            # Strip leading/trailing whitespace; allow a punctuation-only input.
            graph = delete_space + graph + delete_space
            graph |= punct

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import (
    INPUT_LOWER_CASED,
    NEMO_UPPER,
    GraphFst,
    convert_space,
)
from nemo_text_processing.text_normalization.hi.utils import (
    augment_labels_with_punct_at_end,
    get_abs_path,
    load_labels,
)


class WhiteListFst(GraphFst):
    """
    Finite state transducer for classifying whitelist replacements, loaded from
    data/whitelist/abbreviations.tsv (or a user-provided TSV).

    Args:
        input_case: accepting either "lower_cased" or "cased" input.
        deterministic: if True will provide a single transduction option,
            for False multiple transductions are generated (used for audio-based normalization)
        input_file: path to a user-provided whitelist TSV; when deterministic it
            replaces the default whitelist, otherwise it is unioned with it.
    """

    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _get_whitelist_graph(input_case, file, keep_punct_add_end: bool = False):
            # Build a string map from a two-column TSV; optionally add variants
            # that keep a trailing "." on the expansion.
            whitelist = load_labels(file)
            if input_case == INPUT_LOWER_CASED:
                whitelist = [[x.lower(), y] for x, y in whitelist]
            else:
                whitelist = [[x, y] for x, y in whitelist]

            if keep_punct_add_end:
                whitelist.extend(augment_labels_with_punct_at_end(whitelist))

            graph = pynini.string_map(whitelist)
            return graph

        graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist/abbreviations.tsv"))

        if deterministic:
            # Fixed: this branch used to read `graph |= graph.optimize()`, a
            # no-op union of the graph with itself; plain optimization suffices.
            graph = graph.optimize()
        else:
            graph |= _get_whitelist_graph(
                input_case, get_abs_path("data/whitelist/abbreviations.tsv"), keep_punct_add_end=True
            )

        # Accept dotted uppercase abbreviations such as "U.S." / "U. S.",
        # removing the dots (and an optional final dot).
        for x in [".", ". "]:
            graph |= (
                NEMO_UPPER
                + pynini.closure(pynutil.delete(x) + NEMO_UPPER, 2)
                + pynini.closure(pynutil.delete("."), 0, 1)
            )

        if input_file:
            whitelist_provided = _get_whitelist_graph(input_case, input_file)
            if not deterministic:
                graph |= whitelist_provided
            else:
                # Deterministic mode: the user-provided whitelist replaces the defaults.
                graph = whitelist_provided

        self.graph = (convert_space(graph)).optimize()

        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
सोना -> tokens { name: "सोना" } + + Args: + punctuation: PunctuationFst + deterministic: if True will provide a single transduction option, + for False multiple transductions are generated (used for audio-based normalization) + """ + + def __init__(self, punctuation: PunctuationFst, deterministic: bool = True): + super().__init__(name="word", kind="classify", deterministic=deterministic) + + # Define Hindi characters and symbols using pynini.union + HINDI_CHAR = pynini.union( + *[chr(i) for i in range(ord("ऀ"), ord("ः") + 1)], # Hindi vowels and consonants + *[chr(i) for i in range(ord("अ"), ord("ह") + 1)], # More Hindi characters + *[chr(i) for i in range(ord("ा"), ord("्") + 1)], # Hindi diacritics + *[chr(i) for i in range(ord("०"), ord("९") + 1)], # Hindi digits + ).optimize() + + # Include punctuation in the graph + punct = punctuation.graph + default_graph = pynini.closure(pynini.difference(NEMO_NOT_SPACE, punct.project("input")), 1) + symbols_to_exclude = (pynini.union("$", "€", "₩", "£", "¥", "#", "%") | punct).optimize() + + # Use HINDI_CHAR in the graph + graph = pynini.closure(pynini.difference(HINDI_CHAR, symbols_to_exclude), 1) + graph = pynutil.add_weight(graph, MIN_NEG_WEIGHT) | default_graph + + # Ensure no spaces around punctuation + graph = pynini.closure(graph + pynini.closure(punct + graph, 0, 1)) + + self.graph = convert_space(graph) + self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize() diff --git a/nemo_text_processing/text_normalization/hi/utils.py b/nemo_text_processing/text_normalization/hi/utils.py new file mode 100644 index 000000000..102212183 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/utils.py @@ -0,0 +1,74 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
import csv
import os


def get_abs_path(rel_path):
    """
    Get absolute path

    Args:
        rel_path: relative path to this file

    Returns absolute path
    """
    return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path


def load_labels(abs_path):
    """
    loads a TSV file as a list of label rows

    Args:
        abs_path: absolute path

    Returns list of [input, output, ...] rows
    """
    # `with` guarantees the file handle is closed even if parsing raises.
    with open(abs_path, encoding="utf-8") as label_tsv:
        labels = list(csv.reader(label_tsv, delimiter="\t"))
    return labels


def augment_labels_with_punct_at_end(labels):
    """
    augments labels: if key ends on a punctuation that value does not have, add a new label
    where the value maintains the punctuation

    Args:
        labels : input labels
    Returns:
        additional labels
    """
    res = []
    for label in labels:
        if len(label) > 1:
            # Only "." is propagated; extra columns beyond [key, value] are kept.
            if label[0][-1] == "." and label[1][-1] != ".":
                res.append([label[0], label[1] + "."] + label[2:])
    return res


def apply_fst(text, fst):
    """Prints (does not return) the output string produced by traversing the
    path with the lowest weight. If no valid path accepts the input string,
    prints an error message instead.

    Args:
        text: input string
        fst: the pynini FST to apply
    """
    # Local import keeps this utils module importable without pynini installed.
    import pynini

    try:
        print(pynini.shortestpath(text @ fst).string())
    except pynini.FstOpError:
        print(f"Error: No valid output with given input: '{text}'")
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import MINUS, NEMO_NOT_QUOTE, GraphFst, delete_space


class CardinalFst(GraphFst):
    """
    Finite state transducer for verbalizing cardinal, e.g.
        cardinal { negative: "true" integer: "तेईस" } -> ऋणात्मक तेईस

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic)

        # Fixed: the sign was verbalized with the English words "minus" /
        # "negative" / "dash" in this Hindi verbalizer; use the shared MINUS
        # constant, consistent with the hi decimal and fraction verbalizers.
        # (Assumes MINUS carries any required trailing spacing, exactly as those
        # sibling verbalizers assume — TODO confirm against hi/graph_utils.)
        self.optional_sign = pynini.cross("negative: \"true\"", MINUS)
        self.optional_sign = pynini.closure(self.optional_sign + delete_space, 0, 1)

        integer = pynini.closure(NEMO_NOT_QUOTE)

        # Unwrap integer: "<value>"
        self.integer = delete_space + pynutil.delete("\"") + integer + pynutil.delete("\"")
        integer = pynutil.delete("integer:") + self.integer

        self.numbers = self.optional_sign + integer
        delete_tokens = self.delete_tokens(self.numbers)

        self.fst = delete_tokens.optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space


class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing date, e.g.
        date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } -> "एक अप्रैल दो हज़ार चौबीस"
        date { month: "अप्रैल" day: "एक" year: "दो हज़ार चौबीस" } -> "अप्रैल एक दो हज़ार चौबीस"
    """

    def __init__(self):
        super().__init__(name="date", kind="verbalize")

        def _field(label):
            # Unwrap `label: "<value>"`, keeping only the value.
            return pynutil.delete(label + ": \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

        day = _field("day")
        month = _field("month")
        year = _field("year")

        # Every supported field order, components joined by a single space.
        orders = (
            day + NEMO_SPACE + month
            | month + NEMO_SPACE + day
            | day + NEMO_SPACE + month + NEMO_SPACE + year
            | month + NEMO_SPACE + day + NEMO_SPACE + year
            | month + NEMO_SPACE + year
        )

        # Consume trailing `preserve_order: true` / `field_order: "<c>"` markers.
        # NOTE(review): the field_order value is matched as a single
        # NEMO_NOT_QUOTE character and is not deleted — presumably the hi tagger
        # never emits field_order; confirm before relying on this branch.
        optional_preserve_order = pynini.closure(
            pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
            | pynutil.delete("field_order:")
            + delete_space
            + pynutil.delete("\"")
            + NEMO_NOT_QUOTE
            + pynutil.delete("\"")
            + delete_space
        )

        self.graph = orders + delete_space + optional_preserve_order

        self.fst = self.delete_tokens(self.graph).optimize()
+ decimal { negative: "true" integer_part: "बारह" fractional_part: "पाँच शून्य शून्य छह" quantity: "अरब" } -> ऋणात्मक बारह दशमलव पाँच शून्य शून्य छह + decimal { integer_part: "बारह" quantity: "billion" } -> बारह अरब + + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="decimal", kind="classify", deterministic=deterministic) + + delete_space = pynutil.delete(" ") + self.optional_sign = pynini.closure(pynini.cross("negative: \"true\"", MINUS) + delete_space, 0, 1) + self.integer = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + self.fractional_default = ( + pynutil.delete("fractional_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + + self.fractional = pynutil.insert(" दशमलव ") + self.fractional_default + + self.quantity = ( + delete_space + insert_space + pynutil.delete("quantity: \"") + quantities + pynutil.delete("\"") + ) + self.optional_quantity = pynini.closure(self.quantity, 0, 1) + + graph = self.optional_sign + ( + self.integer + self.quantity | self.integer + delete_space + self.fractional + self.optional_quantity + ) + + self.numbers = graph + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py new file mode 100644 index 000000000..e4cfae302 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import MINUS, NEMO_NOT_QUOTE, GraphFst, insert_space


class FractionFst(GraphFst):
    """
    Finite state transducer for verbalizing fraction
    e.g. fraction { integer_part: "तेईस" numerator: "चार" denominator: "छः" } -> तेईस चार बटा छः
    e.g. fraction { numerator: "चार" denominator: "छः" } -> चार बटा छः

    Args:
        cardinal: CardinalFst (currently unused; kept for signature parity with the tagger)
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)

        # Optional sign, verbalized with the Hindi MINUS constant.
        optional_sign = pynini.closure(pynini.cross("negative: \"true\"", MINUS) + pynutil.delete(" "), 0, 1)

        # NOTE(review): these closures accept an empty field value
        # (closure(NEMO_NOT_QUOTE) is 0-or-more); the taggers presumably never
        # emit empty fields — confirm.
        integer = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ")
        numerator = pynutil.delete("numerator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ")
        denominator = pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")
        # "बटा" separates numerator and denominator in spoken Hindi fractions.
        insert_bata = pynutil.insert(" बटा ")

        fraction_default = numerator + insert_bata + denominator

        # NOTE(review): the outer unbounded closure around
        # (optional integer + insert_space) admits repeated inserted spaces;
        # shortest-path selection picks zero repetitions when there is no
        # integer part, which yields the documented outputs — but the intent
        # was probably closure(integer + insert_space, 0, 1). Confirm before
        # restructuring.
        self.graph = optional_sign + pynini.closure(pynini.closure(integer, 0, 1) + insert_space) + fraction_default

        graph = self.graph

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space


class MeasureFst(GraphFst):
    """
    Finite state transducer for verbalizing measure, e.g.
        measure { cardinal { negative: "true" integer: "बारह" } units: "किलोग्राम" } -> ऋणात्मक बारह किलोग्राम
        measure { decimal { integer_part: "बारह" fractional_part: "दो" } units: "किलोग्राम" } -> बारह दशमलव दो किलोग्राम

    Args:
        cardinal: CardinalFst (verbalizer)
        decimal: DecimalFst (verbalizer)
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="measure", kind="verbalize")

        unit = pynutil.delete("units: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + delete_space

        # Fixed: a tagger-style negative graph (pynini.cross("-", "\"true\""))
        # was copy-pasted here; a verbalizer never sees a raw "-" after
        # "decimal {" / "cardinal {", so it could never match and was removed.
        # The nested verbalizers' `.numbers` already handle the `negative:` field.
        graph_decimal = (
            pynutil.delete("decimal {")
            + delete_space
            + decimal.numbers
            + delete_space
            + pynutil.delete("}")
        )

        graph_cardinal = (
            pynutil.delete("cardinal {")
            + delete_space
            + cardinal.numbers
            + delete_space
            + pynutil.delete("}")
        )

        graph = (graph_cardinal | graph_decimal) + delete_space + insert_space + unit
        self.decimal = graph_decimal
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space


class MoneyFst(GraphFst):
    """
    Finite state transducer for verbalizing money, e.g.
        money { integer_part: "बारह" currency: "रुपए" } -> बारह रुपए
        money { integer_part: "बारह" currency: "रुपए" fractional_part: "पचास" currency: "पैसे" } -> बारह रुपए पचास पैसे

    Args:
        cardinal: CardinalFst (verbalizer)
        decimal: DecimalFst (verbalizer)
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="money", kind="verbalize")

        def _field(label):
            # Unwrap `label: "<value>" `, keeping the value, then re-insert a space.
            return (
                pynutil.delete(label + ': "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('" ') + insert_space
            )

        currency = _field("currency")
        integer_part = _field("integer_part")
        fractional_part = _field("fractional_part")

        # The sub-rupee unit word is appended after the fractional amount.
        insert_paise = pynutil.insert("पैसे")

        # बारह रुपए
        rupees_only = integer_part + delete_space + currency

        # बारह रुपए पचास पैसे
        rupees_and_paise = (
            integer_part + delete_space + currency + delete_space + fractional_part + delete_space + insert_paise
        )

        graph = rupees_only | rupees_and_paise

        self.fst = self.delete_tokens(graph).optimize()
class PostProcessingFst:
    """
    Finite state transducer that post-processes an entire sentence after verbalization is complete,
    e.g. removes extra spaces around punctuation marks:
    " ( one hundred and twenty three ) " -> "(one hundred and twenty three)"

    Args:
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
    """

    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "hi_tn_post_processing.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Reuse the compiled grammar from the FAR cache instead of rebuilding it.
            self.fst = pynini.Far(far_file, mode="r")["post_process_graph"]
            logger.info(f'Post processing graph was restored from {far_file}.')
        else:
            self.set_punct_dict()
            self.fst = self.get_punct_postprocess_graph()

            if far_file:
                # Persist the freshly built grammar for subsequent runs.
                generator_main(far_file, {"post_process_graph": self.fst})

    def set_punct_dict(self):
        # Quote-like Unicode characters that may stand in for an ASCII apostrophe.
        # NOTE(review): this table is built but never consulted by
        # get_punct_postprocess_graph — presumably kept for parity with the English
        # post-processor; confirm before relying on it.
        self.punct_marks = {
            "'": [
                "'",
                '´',
                'ʹ',
                'ʻ',
                'ʼ',
                'ʽ',
                'ʾ',
                'ˈ',
                'ˊ',
                'ˋ',
                '˴',
                'ʹ',
                '΄',
                '՚',
                '՝',
                'י',
                '׳',
                'ߴ',
                'ߵ',
                'ᑊ',
                'ᛌ',
                '᾽',
                '᾿',
                '`',
                '´',
                '῾',
                '‘',
                '’',
                '‛',
                '′',
                '‵',
                'ꞌ',
                ''',
                '`',
                '𖽑',
                '𖽒',
            ],
        }

    def get_punct_postprocess_graph(self):
        """
        Returns a graph that deletes spaces between two adjacent non-space characters.

        NOTE(review): despite the local name, the rewrite context is ANY non-space
        character (NEMO_NOT_SPACE) on both sides — not just quote characters — so this
        rule removes every inter-word space it can match. Confirm this is the intended
        behavior before enabling post-processing by default.
        """

        remove_space_around_single_quote = pynini.cdrewrite(
            delete_space, NEMO_NOT_SPACE, NEMO_NOT_SPACE, pynini.closure(NEMO_SIGMA)
        )
        # Deletes a space whose left and right neighbors are both non-space,
        # applied anywhere within the sentence (NEMO_SIGMA context).

        graph = remove_space_around_single_quote.optimize()

        return graph
    def __init__(self):
        """
        Builds the time verbalizer, e.g.
            time { hours: "बारह" minutes: "दस" seconds: "दस" } -> बारह बजकर दस मिनट दस सेकंड
            time { hours: "सात" minutes: "चालीस" } -> सात बजकर चालीस मिनट
            time { hours: "दस" } -> दस बजे
        """
        super().__init__(name="time", kind="verbalize")

        # hours: "<words>" -> <words> plus a trailing space.
        hour = pynutil.delete("hours: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + insert_space

        # minutes: "<words>" -> <words> plus a trailing space.
        minute = (
            pynutil.delete("minutes: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + insert_space
        )

        # seconds: "<words>" -> <words> plus a trailing space.
        second = (
            pynutil.delete("seconds: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + insert_space
        )

        # Hindi time unit/connector words inserted into the output.
        insert_minute = pynutil.insert("मिनट")
        insert_second = pynutil.insert("सेकंड")
        insert_bajkar = pynutil.insert("बजकर")  # connector used when minutes follow the hour
        insert_baje = pynutil.insert("बजे")  # used for an exact hour

        # hour minute second
        graph_hms = (
            hour
            + delete_space
            + insert_bajkar
            + insert_space
            + minute
            + delete_space
            + insert_minute
            + insert_space
            + second
            + delete_space
            + insert_second
        )

        # hour minute
        graph_hm = hour + delete_space + insert_bajkar + insert_space + minute + delete_space + insert_minute

        # hour only
        graph_h = hour + delete_space + insert_baje

        self.graph = graph_hms | graph_hm | graph_h

        final_graph = self.graph

        delete_tokens = self.delete_tokens(final_graph)
        self.fst = delete_tokens.optimize()
class VerbalizeFst(GraphFst):
    """
    Composes the individual Hindi verbalizer grammars into a single transducer.
    For deployment, this grammar will be compiled and exported to an OpenFst
    Finite State Archive (FAR) file; see NeMo/tools/text_processing_deployment.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic)

        # Cardinal and decimal are built first because the compound grammars
        # (fraction, measure, money) are constructed on top of them.
        cardinal = CardinalFst(deterministic=deterministic)
        decimal = DecimalFst(deterministic=deterministic)

        # Component FSTs, in the same precedence order as the final union.
        component_fsts = [
            cardinal.fst,
            decimal.fst,
            FractionFst(cardinal=cardinal, deterministic=deterministic).fst,
            DateFst().fst,
            TimeFst().fst,
            MeasureFst(cardinal=cardinal, decimal=decimal).fst,
            MoneyFst(cardinal=cardinal, decimal=decimal).fst,
        ]
        # whitelist_graph = WhiteListFst(deterministic=deterministic).fst

        combined = component_fsts[0]
        for component in component_fsts[1:]:
            combined = combined | component

        self.fst = combined
class VerbalizeFinalFst(GraphFst):
    """
    Finite state transducer that verbalizes an entire sentence, e.g.
    tokens { name: "its" } tokens { time { hours: "twelve" minutes: "thirty" } } tokens { name: "now" } tokens { name: "." } -> its twelve thirty now .

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
    """

    def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False):
        super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            # Fix: the FAR cache name must be language-specific. The original used
            # "en_tn_..." (copy-paste from the English verbalizer), which would silently
            # load or clobber the English grammar cache when a cache dir is shared.
            far_file = os.path.join(cache_dir, f"hi_tn_{deterministic}_deterministic_verbalizer.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Reuse the compiled grammar from the FAR cache instead of rebuilding it.
            self.fst = pynini.Far(far_file, mode="r")["verbalize"]
            logging.info(f'VerbalizeFinalFst graph was restored from {far_file}.')
        else:
            verbalize = VerbalizeFst(deterministic=deterministic).fst
            word = WordFst(deterministic=deterministic).fst
            # Any token is either a semiotic-class verbalization or a plain word.
            types = verbalize | word

            if deterministic:
                # Strip the serialized token wrapper: tokens { ... } -> ...
                graph = (
                    pynutil.delete("tokens")
                    + delete_space
                    + pynutil.delete("{")
                    + delete_space
                    + types
                    + delete_space
                    + pynutil.delete("}")
                )
            else:
                graph = delete_space + types + delete_space

            # One or more tokens separated by a single space.
            graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space

            self.fst = graph.optimize()
            if far_file:
                # Persist the freshly built grammar for subsequent runs.
                generator_main(far_file, {"verbalize": self.fst})
                logging.info(f"VerbalizeFinalFst grammars are saved to {far_file}.")
    def __init__(self, deterministic: bool = True):
        """
        Args:
            deterministic: if True will provide a single transduction option,
                for False multiple transduction are generated (used for audio-based normalization)
        """
        super().__init__(name="whitelist", kind="verbalize", deterministic=deterministic)
        # name: "<word>" -> <word>; the payload may not contain spaces.
        graph = (
            pynutil.delete("name:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_CHAR - " ", 1)
            + pynutil.delete("\"")
        )
        # Replace non-breaking spaces (U+00A0) in the payload with regular spaces.
        graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
        self.fst = graph.optimize()
    def __init__(self, deterministic: bool = True):
        """
        Args:
            deterministic: if True will provide a single transduction option,
                for False multiple transduction are generated (used for audio-based normalization)
        """
        super().__init__(name="word", kind="verbalize", deterministic=deterministic)
        # A word payload is one or more non-space characters (multiword names not supported here).
        chars = pynini.closure(NEMO_CHAR - " ", 1)
        punct = pynini.union("!", "?", ".", ",", "-", ":", ";", "।")  # Add other punctuation marks as needed
        # name: "<word>" -> <word>
        char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"")

        # Optionally absorb a single trailing punctuation mark, dropping any space before it.
        graph = char + pynini.closure(delete_space + punct, 0, 1)

        # Explicitly remove spaces before punctuation anywhere in the output.
        remove_space_before_punct = pynini.cdrewrite(pynini.cross(" ", ""), "", punct, NEMO_SIGMA)
        graph = graph @ remove_space_before_punct

        self.fst = graph.optimize()
"sv", "zh", "ar", "it", "hy", "ja"], + choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy", "ja", "hi"], default="en", type=str, ) diff --git a/nemo_text_processing/text_normalization/run_evaluate.py b/nemo_text_processing/text_normalization/run_evaluate.py index 5602a2985..0438579a7 100644 --- a/nemo_text_processing/text_normalization/run_evaluate.py +++ b/nemo_text_processing/text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy'], + choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi'], default="en", type=str, ) @@ -64,7 +64,8 @@ def parse_args(): normalizer = Normalizer(input_case=args.input_case, lang=args.lang) print("Loading training data: " + file_path) - training_data = load_files([file_path]) + to_lower = args.input_case == "lower_cased" + training_data = load_files([file_path], to_lower=to_lower) if args.filter: training_data = filter_loaded_data(training_data) @@ -74,6 +75,9 @@ def parse_args(): sentences_un_normalized, sentences_normalized, _ = training_data_to_sentences(training_data) print("- Data: " + str(len(sentences_normalized)) + " sentences") sentences_prediction = normalizer.normalize_list(sentences_un_normalized) + with open('result.log', 'w') as ofp: + for inp, out in zip(sentences_normalized, sentences_prediction): + ofp.write(f'{inp==out}; {inp}\t{out}\n') print("- Normalized. 
Evaluating...") sentences_accuracy = evaluate( preds=sentences_prediction, labels=sentences_normalized, input=sentences_un_normalized diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..6ba21de69 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,145 @@ +४ चौके~चार चौके +६ खिलाड़ी आउट~छह खिलाड़ी आउट +वनप्लस ८ प्रो~वनप्लस आठ प्रो +५ चार्जर~पाँच चार्जर +४ ओवर में १७ रन~चार ओवर में सत्रह रन +५ चॉकलेट्स ९ टॉफ़िज़~पाँच चॉकलेट्स नौ टॉफ़िज़ +१००९९~दस हज़ार निन्यानबे +१००००१~एक लाख एक +४ छक्के १४ चौके~चार छक्के चौदह चौके +६ रन बनाए~छह रन बनाए +३ गोल मारे~तीन गोल मारे +६ रन बनाए~छह रन बनाए +३ गोल मारे~तीन गोल मारे +५ चौके~पाँच चौके +२ छक्के १२ रन~दो छक्के बारह रन +९ पॉइंट्स~नौ पॉइंट्स +४ मृत १८ घायल~चार मृत अठारह घायल +५ गोल मार~पाँच गोल मार +३ बैट्समैन १२ खिलाड़ी~तीन बैट्समैन बारह खिलाड़ी +५ हार १ ड्रॉ १७ जीत~पाँच हार एक ड्रॉ सत्रह जीत +५१०२२३४५५६७~इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ +पाठ १० श्लोक २~पाठ दस श्लोक दो +१०१~एक सौ एक +१०२~एक सौ दो +१०३~एक सौ तीन +१०४~एक सौ चार +१०५~एक सौ पाँच +१०६~एक सौ छह +१०७~एक सौ सात +१०८~एक सौ आठ +१०९~एक सौ नौ +११०~एक सौ दस +१११~एक सौ ग्यारह +११२~एक सौ बारह +११३~एक सौ तेरह +११४~एक सौ चौदह +११५~एक सौ पंद्रह +११६~एक सौ सोलह +८१७~आठ सौ सत्रह +८१८~आठ सौ अठारह +८१९~आठ सौ उन्नीस +८२०~आठ सौ बीस +८२१~आठ सौ इक्कीस +८२२~आठ सौ बाईस +८२३~आठ सौ तेईस +८२४~आठ सौ चौबीस +८२५~आठ सौ पच्चीस +८२६~आठ सौ छब्बीस +८२७~आठ सौ सत्ताईस +८२८~आठ सौ अट्ठाईस +८२९~आठ सौ उनतीस +८३०~आठ सौ तीस +८३१~आठ सौ इकतीस +८३२~आठ सौ बत्तीस +८३३~आठ सौ तैंतीस +८३४~आठ सौ चौंतीस +८३५~आठ सौ पैंतीस +८३६~आठ सौ छत्तीस +७३७~सात सौ सैंतीस +७३८~सात सौ अड़तीस +७३९~सात सौ उनतालीस +७४०~सात सौ चालीस +७४१~सात सौ इकतालीस +७४२~सात सौ बयालीस +७४३~सात सौ तैंतालीस +७४४~सात सौ चौवालीस +७४५~सात सौ पैंतालीस +४४६~चार सौ छियालीस +४४७~चार सौ सैंतालीस +४४८~चार सौ अड़तालीस +४४९~चार 
सौ उनचास +४५०~चार सौ पचास +४६१~चार सौ इकसठ +१७५~एक सौ पचहत्तर +१८१~एक सौ इक्यासी +१९०~एक सौ नब्बे +१९१~एक सौ इक्यानबे +१९९~एक सौ निन्यानबे +१००१~एक हज़ार एक +१०९९~एक हज़ार निन्यानबे +५५५१ केले~पाँच हज़ार पाँच सौ इक्यावन केले +५५५५२ सेब~पचपन हज़ार पाँच सौ बावन सेब +५३~तिरेपन +५४~चौवन +५५~पचपन +५६~छप्पन +५७~सत्तावन +५८~अट्ठावन +५९~उनसठ +६०~साठ +६१~इकसठ +६२~बासठ +६३~तिरेसठ +६४~चौंसठ +६५~पैंसठ +६६~छियासठ +६७~सड़सठ +६८~अड़सठ +६९~उनहत्तर +७०~सत्तर +७१~इकहत्तर +७२~बहत्तर +७३~तिहत्तर +७४~चौहत्तर +७५~पचहत्तर +७६~छिहत्तर +७७~सतहत्तर +७८~अठहत्तर +७९~उनासी +८०~अस्सी +८१~इक्यासी +८२~बयासी +८३~तिरासी +८४~चौरासी +८५~पचासी +८६~छियासी +८७~सत्तासी +८८~अट्ठासी +८९~नवासी +९०~नब्बे +९१~इक्यानबे +९२~बानबे +९३~तिरानबे +९४~चौरानबे +९५~पंचानबे +९६~छियानबे +९७~सत्तानबे +९८~अट्ठानबे +९९~निन्यानबे +१३२३~एक हज़ार तीन सौ तेईस +१३४५~एक हज़ार तीन सौ पैंतालीस +१३४५६~तेरह हज़ार चार सौ छप्पन +१२३४६~बारह हज़ार तीन सौ छियालीस +१२३४५६~एक लाख तेईस हज़ार चार सौ छप्पन +८७२९८७~आठ लाख बहत्तर हज़ार नौ सौ सत्तासी +९८७६०९~नौ लाख सत्तासी हज़ार छह सौ नौ +९८७६७८९~अट्ठानबे लाख छिहत्तर हज़ार सात सौ नवासी +२३४५५६७~तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ +१२३४५५६७~एक करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ +१२१२१२१२~एक करोड़ इक्कीस लाख इक्कीस हज़ार दो सौ बारह +११२२३४५५६७~एक अरब बारह करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ +१०२२३४५५६७~एक अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ +११०२२३४५५६७~ग्यारह अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ +५१०२२३४५५६७~इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ +२ पॉइंट्स १२ गोल~दो पॉइंट्स बारह गोल diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..d92a53852 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -0,0 +1,19 @@ +०६-०५~छः मई +३१-०६~इकतीस जून +०२-०१~दो जनवरी +०४-०१~चार जनवरी +०१-१०~एक अक्टूबर +१२-०७~बारह जुलाई +०२-२७~फ़रवरी सत्ताईस +०४-०३~चार 
मार्च +२५-०३-२०२०~पच्चीस मार्च दो हज़ार बीस +३०-०५-२०७०~तीस मई दो हज़ार सत्तर +१२-०७-१९७०~बारह जुलाई उन्नीस सौ सत्तर +०९-१२-२१०१~नौ दिसंबर इक्कीस सौ एक +२३-०८-२०२४~तेईस अगस्त दो हज़ार चौबीस +१०-२९-२०००~अक्टूबर उनतीस दो हज़ार +११-१४-११००~नवंबर चौदह ग्यारह सौ +०३-२०१०~मार्च दो हज़ार दस +११-२०२४~नवंबर दो हज़ार चौबीस +२०७०~दो हज़ार सत्तर +२०२४~दो हज़ार चौबीस diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..3ec53dd4b --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_decimal.txt @@ -0,0 +1,20 @@ +९९.९९~निन्यानबे दशमलव नौ नौ +९७.०~सत्तानबे दशमलव शून्य +२५६३.४१२~दो हज़ार पाँच सौ तिरेसठ दशमलव चार एक दो +७२८६०.७०~बहत्तर हज़ार आठ सौ साठ दशमलव सात शून्य +०.००८~शून्य दशमलव शून्य शून्य आठ +०.०००३~शून्य दशमलव शून्य शून्य शून्य तीन +४०.०~चालीस दशमलव शून्य +८०.०~अस्सी दशमलव शून्य +१५००.२२~एक हज़ार पाँच सौ दशमलव दो दो +५०००.१२३५६~पाँच हज़ार दशमलव एक दो तीन पाँच छह +१०००.३१~एक हज़ार दशमलव तीन एक +५१४६.१७~पाँच हज़ार एक सौ छियालीस दशमलव एक सात +१००००.९९९~दस हज़ार दशमलव नौ नौ नौ +१०००००.१७~एक लाख दशमलव एक सात +१०००००००.३१~एक करोड़ दशमलव तीन एक +१०००००००००.२२~एक अरब दशमलव दो दो +१०००००००००००.७०~एक खरब दशमलव सात शून्य +१०००००००००००००.०००३~एक नील दशमलव शून्य शून्य शून्य तीन +१०००००००००००००००.००८~एक पद्म दशमलव शून्य शून्य आठ +१०००००००००००००००००.४१२~एक शंख दशमलव चार एक दो diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt new file mode 100644 index 000000000..25c18b777 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt @@ -0,0 +1,21 @@ +९९/९९~निन्यानबे बटा निन्यानबे +२२ ३१/१७~बाईस इकतीस बटा सत्रह +९७/०~सत्तानबे बटा शून्य +२५६३/४१२~दो हज़ार पाँच सौ तिरेसठ बटा चार सौ बारह +७२८६०/७०~बहत्तर हज़ार आठ सौ साठ बटा सत्तर +०/८~शून्य बटा आठ +३/०~तीन 
बटा शून्य +४०/०~चालीस बटा शून्य +८०/०~अस्सी बटा शून्य +१५००/२२~एक हज़ार पाँच सौ बटा बाईस +५०००/१२३५६~पाँच हज़ार बटा बारह हज़ार तीन सौ छप्पन +१०००/३१~एक हज़ार बटा इकतीस +५१४६/१७~पाँच हज़ार एक सौ छियालीस बटा सत्रह +१००००/९९९~दस हज़ार बटा नौ सौ निन्यानबे +१०००००/१७~एक लाख बटा सत्रह +१०००००००/३१~एक करोड़ बटा इकतीस +१०००००००००/२२~एक अरब बटा बाईस +१०००००००००००/७०~एक खरब बटा सत्तर +१०००००००००००००/३~एक नील बटा तीन +१०००००००००००००००/८~एक पद्म बटा आठ +१०००००००००००००००००/४१२~एक शंख बटा चार सौ बारह diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..453369f82 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt @@ -0,0 +1,62 @@ +१९ m²~उन्नीस वर्ग मीटर +१२.१९ m²~बारह दशमलव एक नौ वर्ग मीटर +२० km²~बीस वर्ग किलोमीटर +२०.७ km²~बीस दशमलव सात वर्ग किलोमीटर +२ ha~दो हेक्टेयर +२.७ ha~दो दशमलव सात हेक्टेयर +१ ac~एक एकड़ +३.८ ac~तीन दशमलव आठ एकड़ +४ m~चार मीटर +४.२ m~चार दशमलव दो मीटर +१८ mi~अठारह मील +१८.५५ mi~अठारह दशमलव पाँच पाँच मील +३४ in~चौंतीस इंच +३४.२ in~चौंतीस दशमलव दो इंच +४० ft~चालीस फीट +४०.३ ft~चालीस दशमलव तीन फीट +३९ yd~उनतालीस यार्ड +३९.१८ yd~उनतालीस दशमलव एक आठ यार्ड +३५ µm~पैंतीस माइक्रोमीटर +३५.३ µm~पैंतीस दशमलव तीन माइक्रोमीटर +५ km/hr~पाँच किलोमीटर प्रति घंटा +५.३५ km/hr~पाँच दशमलव तीन पाँच किलोमीटर प्रति घंटा +३ mi/hr~तीन मील प्रति घंटा +३.५ mi/hr~तीन दशमलव पाँच मील प्रति घंटा +२५ °C~पच्चीस डिग्री सेल्सियस +२५.४ °C~पच्चीस दशमलव चार डिग्री सेल्सियस +२२ °F~बाईस डिग्री फारेनहाइट +२२.५ °F~बाईस दशमलव पाँच डिग्री फारेनहाइट +७ K~सात केल्विन +७.२२ K~सात दशमलव दो दो केल्विन +५ L~पाँच लीटर +५.४ L~पाँच दशमलव चार लीटर +५० ml~पचास मिलीलीटर +५०.५ ml~पचास दशमलव पाँच मिलीलीटर +१९ qt~उन्नीस क्वार्ट +१९.७ qt~उन्नीस दशमलव सात क्वार्ट +५ gal~पाँच गैलन +५.७ gal~पाँच दशमलव सात गैलन +७६ pt~छिहत्तर पिंट +७६.८८ pt~छिहत्तर दशमलव आठ आठ पिंट +७७ g~सतहत्तर ग्राम +७७.१९ g~सतहत्तर दशमलव एक नौ ग्राम 
+५ kg~पाँच किलोग्राम +५.६ kg~पाँच दशमलव छह किलोग्राम +५० kg~पचास किलोग्राम +५०.५ kg~पचास दशमलव पाँच किलोग्राम +९० mg~नब्बे मिलीग्राम +९०.७ mg~नब्बे दशमलव सात मिलीग्राम +८२ cg~बयासी सेंटीग्राम +८२.५ cg~बयासी दशमलव पाँच सेंटीग्राम +९७ dg~सत्तानबे डेसीग्राम +९७.७७ dg~सत्तानबे दशमलव सात सात डेसीग्राम +६५ t~पैंसठ टन +६५.६ t~पैंसठ दशमलव छह टन +८८ st~अट्ठासी स्टोन +८८.५ st~अट्ठासी दशमलव पाँच स्टोन +९३ lb~तिरानबे पाउंड +९३.४ lb~तिरानबे दशमलव चार पाउंड +९९ oz~निन्यानबे आउन्स +९९.५ oz~निन्यानबे दशमलव पाँच आउन्स +८५ q~पचासी क्विंटल +८५.९९ q~पचासी दशमलव नौ नौ क्विंटल diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..c7b32628b --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt @@ -0,0 +1,100 @@ +₹१११~एक सौ ग्यारह रुपए +£१८००~एक हज़ार आठ सौ पाउंड +₩७६०~सात सौ साठ वॉन +$५००~पाँच सौ डॉलर +₹२२~बाईस रुपए +₺५६००~पाँच हज़ार छह सौ लीरा +₺१२००~एक हज़ार दो सौ लीरा +₺१२४~एक सौ चौबीस लीरा +$६९६~छह सौ छियानबे डॉलर +₹१४४~एक सौ चौवालीस रुपए +₺६१५~छह सौ पंद्रह लीरा +₩३३०~तीन सौ तीस वॉन +£७००~सात सौ पाउंड +₹५३०~पाँच सौ तीस रुपए +₺८५~पचासी लीरा +₩१९०~एक सौ नब्बे वॉन +₺६~छह लीरा +₺१००~एक सौ लीरा +£२०~बीस पाउंड +$५०१५~पाँच हज़ार पंद्रह डॉलर +₺६७०~छह सौ सत्तर लीरा +$८००~आठ सौ डॉलर +$७५०००~पचहत्तर हज़ार डॉलर +$७५०~सात सौ पचास डॉलर +₹२१३२~दो हज़ार एक सौ बत्तीस रुपए +₹१९८~एक सौ अट्ठानबे रुपए +₹१११५~एक हज़ार एक सौ पंद्रह रुपए +₺५३०~पाँच सौ तीस लीरा +₺५~पाँच लीरा +₹३१८०~तीन हज़ार एक सौ अस्सी रुपए +₹२४५~दो सौ पैंतालीस रुपए +₹२१४८~दो हज़ार एक सौ अड़तालीस रुपए +₺५१४~पाँच सौ चौदह लीरा +₹१५७४~एक हज़ार पाँच सौ चौहत्तर रुपए +$१५००~एक हज़ार पाँच सौ डॉलर +₹२७५~दो सौ पचहत्तर रुपए +₺२३~तेईस लीरा +₺४०~चालीस लीरा +₺२९१~दो सौ इक्यानबे लीरा +₩३२~बत्तीस वॉन +$५४०~पाँच सौ चालीस डॉलर +$१९४६~एक हज़ार नौ सौ छियालीस डॉलर +₹६५०~छह सौ पचास रुपए +₺४९~उनचास लीरा +₹२१९०~दो हज़ार एक सौ नब्बे रुपए +₹१०००~एक हज़ार रुपए +£१००~एक सौ पाउंड 
+₹५१३५~पाँच हज़ार एक सौ पैंतीस रुपए +₹३२२~तीन सौ बाईस रुपए +$७~सात डॉलर +₩१९५~एक सौ पंचानबे वॉन +$१०००~एक हज़ार डॉलर +₺१७०८~एक हज़ार सात सौ आठ लीरा +$९~नौ डॉलर +$४४०~चार सौ चालीस डॉलर +$१२८~एक सौ अट्ठाईस डॉलर +₺८~आठ लीरा +£३०~तीस पाउंड +₹१०००~एक हज़ार रुपए +₩५४३~पाँच सौ तैंतालीस वॉन +₹४०५~चार सौ पाँच रुपए +£४०~चालीस पाउंड +₹४७~सैंतालीस रुपए +$६९~उनहत्तर डॉलर +₹२२०~दो सौ बीस रुपए +₹५१~इक्यावन रुपए +₺४५~पैंतालीस लीरा +₹६३३~छह सौ तैंतीस रुपए +$१०००~एक हज़ार डॉलर +$२०००~दो हज़ार डॉलर +₹३२०~तीन सौ बीस रुपए +₹४६७~चार सौ सड़सठ रुपए +₹८११~आठ सौ ग्यारह रुपए +₹१०४०~एक हज़ार चालीस रुपए +$७६५~सात सौ पैंसठ डॉलर +₩५९९~पाँच सौ निन्यानबे वॉन +₹५५०~पाँच सौ पचास रुपए +₹६५६००~पैंसठ हज़ार छह सौ रुपए +$२९~उनतीस डॉलर +₩६००~छह सौ वॉन +₹१८९~एक सौ नवासी रुपए +₹५९१~पाँच सौ इक्यानबे रुपए +₹१६८९~एक हज़ार छह सौ नवासी रुपए +₹१०९~एक सौ नौ रुपए +₺१२~बारह लीरा +₹२०३~दो सौ तीन रुपए +₹६३५~छह सौ पैंतीस रुपए +₹४६९~चार सौ उनहत्तर रुपए +$१२७~एक सौ सत्ताईस डॉलर +$८५~पचासी डॉलर +₹५५००००००~पाँच करोड़ पचास लाख रुपए +$२८२१~दो हज़ार आठ सौ इक्कीस डॉलर +₹१२५४०००~बारह लाख चौवन हज़ार रुपए +₹३१५~तीन सौ पंद्रह रुपए +₹२०४४~दो हज़ार चौवालीस रुपए +₹१००००~दस हज़ार रुपए +₹५४५~पाँच सौ पैंतालीस रुपए +₹१८४५~एक हज़ार आठ सौ पैंतालीस रुपए +₹३७२~तीन सौ बहत्तर रुपए +$९८~अट्ठानबे डॉलर \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..9d670aa8a --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_time.txt @@ -0,0 +1,18 @@ +१२:१०:१०~बारह बजकर दस मिनट दस सेकंड +५:१२:०१~पाँच बजकर बारह मिनट एक सेकंड +३:१८:४३~तीन बजकर अठारह मिनट तैंतालीस सेकंड +२:१६~दो बजकर सोलह मिनट +७:२१~सात बजकर इक्कीस मिनट +११:५७~ग्यारह बजकर सत्तावन मिनट +८:००~आठ बजे +४:००~चार बजे +९:००~नौ बजे +सुबह के ५:२०:१२~सुबह के पाँच बजकर बीस मिनट बारह सेकंड +सुबह के ६:३९~सुबह के छह बजकर उनतालीस मिनट +सुबह के २:००~सुबह के दो बजे +दोपहर के ३:५९:३६~दोपहर के तीन बजकर 
उनसठ मिनट छत्तीस सेकंड +दोपहर के १:३६~दोपहर के एक बजकर छत्तीस मिनट +दोपहर के ३:००~दोपहर के तीन बजे +रात के १०:४८:५०~रात के दस बजकर अड़तालीस मिनट पचास सेकंड +रात के ११:५०~रात के ग्यारह बजकर पचास मिनट +रात के ८:००~रात के आठ बजे \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..68888de9e --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,14 @@ +डॉ. अमित~डॉक्टर अमित +प्रो. पूजा~प्रोफेसर पूजा +इं. रेखा~इंजीनियर रेखा +ले. रोहित~लेफ्टिनेंट रोहित +वै. निखिल~वैज्ञानिक निखिल +कु. मेघा~कुमारी मेघा +मा. अंकित~मास्टर अंकित +डॉ. सीमा~डॉक्टर सीमा +प्रो. अरुण~प्रोफेसर अरुण +इं. सुमित~इंजीनियर सुमित +ले. अंजलि~लेफ्टिनेंट अंजलि +वै. प्रिया~वैज्ञानिक प्रिया +कु. रिया~कुमारी रिया +मा. विवेक~मास्टर विवेक \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..e9649919e --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_word.txt @@ -0,0 +1,16 @@ +नींद~नींद +याहू!~याहू! +।~। +आआआ~आआआ +आकाशगंगा~आकाशगंगा +लटरपटर~लटरपटर +कच्चा-पक्का~कच्चा-पक्का +गुब्बारा~गुब्बारा +चिट्ठी~चिट्ठी +ढूंढना~ढूंढना +लोहे का!~लोहे का! +वाह!~वाह! 
+टाटा~टाटा +~ +झ~झ +संगीत~संगीत \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_cardinal.py b/tests/nemo_text_processing/hi/test_cardinal.py index 4a019bf6f..8298ec0e3 100644 --- a/tests/nemo_text_processing/hi/test_cardinal.py +++ b/tests/nemo_text_processing/hi/test_cardinal.py @@ -17,15 +17,23 @@ from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestCardinal: - + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False + ) inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_cardinal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit diff --git a/tests/nemo_text_processing/hi/test_date.py b/tests/nemo_text_processing/hi/test_date.py index edceddc74..df12e9874 100644 --- a/tests/nemo_text_processing/hi/test_date.py +++ b/tests/nemo_text_processing/hi/test_date.py @@ -12,19 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import pytest from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio +from nemo_text_processing.text_normalization.normalize import Normalizer -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestDate: + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False + ) inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_date.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit diff --git a/tests/nemo_text_processing/hi/test_decimal.py b/tests/nemo_text_processing/hi/test_decimal.py index ad628baac..582b59422 100644 --- a/tests/nemo_text_processing/hi/test_decimal.py +++ b/tests/nemo_text_processing/hi/test_decimal.py @@ -17,17 +17,26 @@ from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestDecimal: + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False + ) inverse_normalizer = InverseNormalizer(lang='hi', 
cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_decimal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected + assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_fraction.py b/tests/nemo_text_processing/hi/test_fraction.py index 90c363487..bedf9d0f7 100644 --- a/tests/nemo_text_processing/hi/test_fraction.py +++ b/tests/nemo_text_processing/hi/test_fraction.py @@ -12,22 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import pytest from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio +from nemo_text_processing.text_normalization.normalize import Normalizer -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestFraction: + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False + ) inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_fraction.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_fraction.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected + assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_measure.py b/tests/nemo_text_processing/hi/test_measure.py index 9094b778c..71352cdc8 100644 --- a/tests/nemo_text_processing/hi/test_measure.py +++ b/tests/nemo_text_processing/hi/test_measure.py @@ -12,22 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import pytest from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio +from nemo_text_processing.text_normalization.normalize import Normalizer -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestMeasure: + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False + ) inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_measure.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected + assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_money.py b/tests/nemo_text_processing/hi/test_money.py index b9dcd4452..0665146a6 100644 --- a/tests/nemo_text_processing/hi/test_money.py +++ b/tests/nemo_text_processing/hi/test_money.py @@ -12,22 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import pytest from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio +from nemo_text_processing.text_normalization.normalize import Normalizer -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestMoney: + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False + ) inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_money.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_money.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected + assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_ordinal.py b/tests/nemo_text_processing/hi/test_ordinal.py index 7aedc243f..b65252694 100644 --- a/tests/nemo_text_processing/hi/test_ordinal.py +++ b/tests/nemo_text_processing/hi/test_ordinal.py @@ -17,9 +17,8 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestOrdinal: diff --git 
a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh new file mode 100644 index 000000000..498443f71 --- /dev/null +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh @@ -0,0 +1,120 @@ +#! /bin/sh + +PROJECT_DIR=/workspace/tests + +runtest () { + input=$1 + cd /workspace/sparrowhawk/documentation/grammars + + # read test file + while read testcase; do + IFS='~' read written spoken <<< $testcase + + # replace non breaking space with breaking space + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + # trim white space + # spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + # denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # trim white space and remove space before punctuation + spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e 's/ \([!?.]\)/\1/g')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e 's/ \([!?.]\)/\1/g')" + + # input expected actual + assertEquals "$written" "$spoken" "$denorm_pred" + done < "$input" +} + +#testTNSpecialText() { +# input=$PROJECT_DIR/hi/data_text_normalization/test_cases_special_text.txt +# runtest $input +#} + +testTNCardinal() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testTNDate() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_date.txt + runtest $input +} + +testTNDecimal() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_decimal.txt + runtest $input +} + +#testTNRange() { +# input=$PROJECT_DIR/hi/data_text_normalization/test_cases_range.txt +# runtest $input +#} + +#testTNSerial() { +# input=$PROJECT_DIR/hi/data_text_normalization/test_cases_serial.txt +# runtest $input +#} + +#testTNRoman() { +# 
input=$PROJECT_DIR/en/data_text_normalization/test_cases_roman.txt +# runtest $input +#} + +#testTNElectronic() { +# input=$PROJECT_DIR/en/data_text_normalization/test_cases_electronic.txt +# runtest $input +#} + +testTNFraction() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_fraction.txt + runtest $input +} + +testTNMoney() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_money.txt + runtest $input +} + +#testTNOrdinal() { +# input=$PROJECT_DIR/hi/data_text_normalization/test_cases_ordinal.txt +# runtest $input +#} + +#testTNTelephone() { +# input=$PROJECT_DIR/en/data_text_normalization/test_cases_telephone.txt +# runtest $input +#} + +testTNTime() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_time.txt + runtest $input +} + +testTNMeasure() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_measure.txt + runtest $input +} + +testTNWhitelist() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_whitelist.txt + runtest $input +} + +testTNWord() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_word.txt + runtest $input +} + +#testTNAddress() { +# input=$PROJECT_DIR/en/data_text_normalization/test_cases_address.txt +# runtest $input +#} + +#testTNMath() { +# input=$PROJECT_DIR/en/data_text_normalization/test_cases_math.txt +# runtest $input +#} + +# Load shUnit2 +. $PROJECT_DIR/../shunit2/shunit2 diff --git a/tests/nemo_text_processing/hi/test_time.py b/tests/nemo_text_processing/hi/test_time.py index f71434e95..402faf414 100644 --- a/tests/nemo_text_processing/hi/test_time.py +++ b/tests/nemo_text_processing/hi/test_time.py @@ -12,22 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import pytest from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio +from nemo_text_processing.text_normalization.normalize import Normalizer -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestTime: + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False + ) inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_time.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected + assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_whitelist.py b/tests/nemo_text_processing/hi/test_whitelist.py new file mode 100644 index 000000000..4a090d823 --- /dev/null +++ b/tests/nemo_text_processing/hi/test_whitelist.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWhitelist: + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False + ) + + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_word.py b/tests/nemo_text_processing/hi/test_word.py new file mode 100644 index 000000000..4d6bd2261 --- /dev/null +++ b/tests/nemo_text_processing/hi/test_word.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWord: + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) + + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) + assert pred == expected diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 0ecaf0e8e..23b1f7deb 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -210,6 +210,13 @@ def parse_args(): from nemo_text_processing.inverse_text_normalization.hi.verbalizers.verbalize import ( VerbalizeFst as ITNVerbalizeFst, ) + from nemo_text_processing.text_normalization.hi.taggers.tokenize_and_classify import ( + ClassifyFst as TNClassifyFst, + ) + from nemo_text_processing.text_normalization.hi.verbalizers.post_processing import ( + PostProcessingFst as TNPostProcessingFst, + ) + from nemo_text_processing.text_normalization.hi.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst elif args.language == 'hu': from nemo_text_processing.text_normalization.hu.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst,