From 562fc4cdfb3f61884fc007bfc3b1ff9dbd70fb77 Mon Sep 17 00:00:00 2001
From: MartinZZM <442823793@qq.com>
Date: Sun, 7 Dec 2025 16:34:06 +0800
Subject: [PATCH] fix python code in chapter2

---
 chapters/en/chapter2/4.mdx    | 6 +++---
 chapters/es/chapter2/4.mdx    | 6 +++---
 chapters/fr/chapter2/4.mdx    | 6 +++---
 chapters/it/chapter2/4.mdx    | 6 +++---
 chapters/ja/chapter2/4.mdx    | 6 +++---
 chapters/ko/chapter2/4.mdx    | 6 +++---
 chapters/my/chapter2/4.mdx    | 6 +++---
 chapters/pt/chapter2/4.mdx    | 6 +++---
 chapters/ro/chapter2/4.mdx    | 6 +++---
 chapters/ru/chapter2/4.mdx    | 6 +++---
 chapters/th/chapter2/4.mdx    | 6 +++---
 chapters/vi/chapter2/4.mdx    | 6 +++---
 chapters/zh-CN/chapter2/4.mdx | 6 +++---
 chapters/zh-TW/chapter2/4.mdx | 6 +++---
 14 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/chapters/en/chapter2/4.mdx b/chapters/en/chapter2/4.mdx
index 024af4860..df0ddb815 100644
--- a/chapters/en/chapter2/4.mdx
+++ b/chapters/en/chapter2/4.mdx
@@ -129,7 +129,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 We can now use the tokenizer as shown in the previous section:

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -167,7 +167,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -210,7 +210,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 Note that the `decode` method not only converts the indices back to tokens, but also groups together the tokens that were part of the same words to produce a readable sentence. This behavior will be extremely useful when we use models that predict new text (either text generated from a prompt, or for sequence-to-sequence problems like translation or summarization).

diff --git a/chapters/es/chapter2/4.mdx b/chapters/es/chapter2/4.mdx
index 1202ab35a..003009379 100644
--- a/chapters/es/chapter2/4.mdx
+++ b/chapters/es/chapter2/4.mdx
@@ -148,7 +148,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 Ahora podemos utilizar el tokenizador como se muestra en la sección anterior:

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -186,7 +186,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -229,7 +229,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 Notemos que el método `decode` no sólo convierte los índices de nuevo en tokens, sino que también agrupa los tokens que formaban parte de las mismas palabras para producir una frase legible. Este comportamiento será extremadamente útil cuando utilicemos modelos que predigan texto nuevo (ya sea texto generado a partir de una indicación, o para problemas de secuencia a secuencia como la traducción o el resumen).
diff --git a/chapters/fr/chapter2/4.mdx b/chapters/fr/chapter2/4.mdx
index 8ad739967..1137ad684 100644
--- a/chapters/fr/chapter2/4.mdx
+++ b/chapters/fr/chapter2/4.mdx
@@ -161,7 +161,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 Nous pouvons à présent utiliser le *tokenizer* comme indiqué dans la section précédente :

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -201,7 +201,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -246,7 +246,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 Notez que la méthode `decode` non seulement reconvertit les indices en *tokens* mais regroupe également les *tokens* faisant partie des mêmes mots. Le but étant de produire une phrase lisible. Ce comportement sera extrêmement utile lorsque dans la suite du cours nous utiliserons des modèles pouvant produire du nouveau texte (soit du texte généré à partir d'un *prompt*, soit pour des problèmes de séquence à séquence comme la traduction ou le résumé de texte).

diff --git a/chapters/it/chapter2/4.mdx b/chapters/it/chapter2/4.mdx
index 7e9d0e244..3b41445fa 100644
--- a/chapters/it/chapter2/4.mdx
+++ b/chapters/it/chapter2/4.mdx
@@ -148,7 +148,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 Ora possiamo usare il tokenizer come mostrato nella sezione precedente:

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -186,7 +186,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -229,7 +229,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 Si noti che il metodo `decode` non solo converte gli indici in token, ma raggruppa anche i token che fanno parte delle stesse parole per produrre una frase leggibile. Questo comportamento sarà estremamente utile quando utilizzeremo modelli che prevedono un nuovo testo (o un testo generato da un prompt, o per problemi di sequenza-sequenza come la traduzione o il riassunto).
diff --git a/chapters/ja/chapter2/4.mdx b/chapters/ja/chapter2/4.mdx
index 397681a53..bff0019f5 100644
--- a/chapters/ja/chapter2/4.mdx
+++ b/chapters/ja/chapter2/4.mdx
@@ -148,7 +148,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 そして、前のセクションで見たようにトークナイザを使用することができます。

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -186,7 +186,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -229,7 +229,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 `decode` メソッドは語彙のインデックスをトークンに戻すだけでなく、同じ単語の一部であったトークンをまとめて、読みやすい文章に変換するところも担っています。この挙動は、プロンプトから生成されたテキストや、翻訳や要約などの系列から系列への変換などの問題を解くモデルを使うときに非常に役に立ちます。

diff --git a/chapters/ko/chapter2/4.mdx b/chapters/ko/chapter2/4.mdx
index 035a16b61..42135e223 100644
--- a/chapters/ko/chapter2/4.mdx
+++ b/chapters/ko/chapter2/4.mdx
@@ -149,7 +149,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 이제 이전 섹션에서 본 것처럼 토크나이저를 사용할 수 있습니다.

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -187,7 +187,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -230,7 +230,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 `decode` 메서드는 인덱스를 토큰으로 바꿀 뿐만 아니라, 읽기 좋은 문장을 만들기 위해 같은 단어의 일부인 토큰을 그룹화합니다. 이 과정은 새 텍스트(프롬프트에서 생성된 텍스트 또는 번역이나 요약과 같은 시퀀스 간 문제)를 예측하는 모델을 쓸 때 매우 유용합니다.
diff --git a/chapters/my/chapter2/4.mdx b/chapters/my/chapter2/4.mdx
index c1acefe9a..0a5e00c59 100644
--- a/chapters/my/chapter2/4.mdx
+++ b/chapters/my/chapter2/4.mdx
@@ -129,7 +129,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 အခု ကျွန်တော်တို့ tokenizer ကို ယခင်အပိုင်းမှာ ပြသခဲ့သလို အသုံးပြုနိုင်ပါပြီ။

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -167,7 +167,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -210,7 +210,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 `decode` method က indices တွေကို tokens တွေအဖြစ် ပြန်ပြောင်းပေးရုံသာမကဘဲ၊ တူညီတဲ့ စကားလုံးရဲ့ အစိတ်အပိုင်းဖြစ်တဲ့ tokens တွေကို စုစည်းပြီး ဖတ်လို့ရတဲ့ စာကြောင်းတစ်ခုကို ထုတ်လုပ်ပေးတာကို သတိပြုပါ။ ဒီ behavior က text အသစ်တွေကို ခန့်မှန်းတဲ့ model တွေကို အသုံးပြုတဲ့အခါ (prompt တစ်ခုကနေ ထုတ်လုပ်တဲ့ text ဖြစ်စေ၊ ဒါမှမဟုတ် translation သို့မဟုတ် summarization လို sequence-to-sequence ပြဿနာတွေအတွက် ဖြစ်စေ) အလွန်အသုံးဝင်ပါလိမ့်မယ်။

diff --git a/chapters/pt/chapter2/4.mdx b/chapters/pt/chapter2/4.mdx
index a0346e272..191a501b7 100644
--- a/chapters/pt/chapter2/4.mdx
+++ b/chapters/pt/chapter2/4.mdx
@@ -151,7 +151,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 Agora podemos usar o tokenizer, como mostrado na seção anterior:

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -191,7 +191,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -235,7 +235,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 Observe que o método `decode` não apenas converte os índices em tokens, mas também agrupa os tokens que fizeram parte das mesmas palavras para produzir uma frase legível. Este comportamento será extremamente útil quando utilizamos modelos que preveem um novo texto (seja texto gerado a partir de um prompt, ou para problemas de _sequence-to-sequence_ como tradução ou sumarização).
diff --git a/chapters/ro/chapter2/4.mdx b/chapters/ro/chapter2/4.mdx
index e5845c115..72304e3e3 100644
--- a/chapters/ro/chapter2/4.mdx
+++ b/chapters/ro/chapter2/4.mdx
@@ -158,7 +158,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 Acum putem utiliza tokenizatorul așa cum am arătat în secțiunea anterioară:

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -196,7 +196,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -239,7 +239,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 Rețineți că metoda `decode` nu numai că convertește indicii înapoi în token-uri, dar și grupează token-urile care fac parte din aceleași cuvinte pentru a produce o propoziție inteligibilă. Acest comportament va fi extrem de util atunci când vom utiliza modele care prezic text nou (fie text generat de un prompt, fie pentru probleme sequence-to-sequence, precum traducerea sau rezumarea).

diff --git a/chapters/ru/chapter2/4.mdx b/chapters/ru/chapter2/4.mdx
index 167cce0d2..78a5cb521 100644
--- a/chapters/ru/chapter2/4.mdx
+++ b/chapters/ru/chapter2/4.mdx
@@ -148,7 +148,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 Теперь мы можем использовать токенизатор, как показано в предыдущем разделе:

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -186,7 +186,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -229,7 +229,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 Обратите внимание, что метод `decode` не только преобразует индексы обратно в токены, но и группирует токены, которые были частью одних и тех же слов, чтобы создать читаемое предложение. Такое поведение будет очень полезно, когда мы будем использовать модели, прогнозирующие новый текст (либо текст, сгенерированный из подсказки (prompt), либо для решения задачи преобразования последовательности-в-последовательность (sequence-to-sequence), такой как перевод или резюмирование).
diff --git a/chapters/th/chapter2/4.mdx b/chapters/th/chapter2/4.mdx
index c22c0969b..d77f49047 100644
--- a/chapters/th/chapter2/4.mdx
+++ b/chapters/th/chapter2/4.mdx
@@ -148,7 +148,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 ตอนนี้เราสามารถใช้ tokenizer เหมือนที่แสดงใน section ที่แล้ว:

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -186,7 +186,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -229,7 +229,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 วิธี `decode` ไม่ได้ทำแค่การแปลงดัชนี(indices) ไปเป็น token เท่านั้น แต่ยังทำการรวม tokens ต่างๆที่เป็นส่วนหนึ่งของคำเดียวกันเพื่อสร้างประโยคที่สามารถอ่านได้ กระบวนการเช่นนี้จะเป็นประโยชน์อย่างมากเมื่อเราใช้โมเดลสำหรับทำนายข้อความใหม่(ไม่ว่าจะเป็นข้อความที่สร้างจาก prompt หรือปัญหาประเภท sequence-to-sequence เช่น การแปล(translation) หรือ การสรุปใจความสำคัญ(summarization))

diff --git a/chapters/vi/chapter2/4.mdx b/chapters/vi/chapter2/4.mdx
index 08c9ecee2..c7ba7c1ed 100644
--- a/chapters/vi/chapter2/4.mdx
+++ b/chapters/vi/chapter2/4.mdx
@@ -146,7 +146,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 Giờ chúng ta có thể sử dụng tokenizer như trong đoạn dưới đây:

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -184,7 +184,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -227,7 +227,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 Lưu ý rằng phương pháp `giải mã` không chỉ chuyển đổi các chỉ số trở lại thành token, mà còn nhóm các token là một phần của cùng một từ lại với nhau để tạo ra một câu có thể đọc được. Hành vi này sẽ cực kỳ hữu ích khi chúng ta sử dụng các mô hình dự đoán văn bản mới (văn bản được tạo từ lời nhắc hoặc đối với các bài toán chuỗi-sang-chuỗi như dịch hoặc tóm tắt văn bản).
diff --git a/chapters/zh-CN/chapter2/4.mdx b/chapters/zh-CN/chapter2/4.mdx
index 5b57e7222..ffc189347 100644
--- a/chapters/zh-CN/chapter2/4.mdx
+++ b/chapters/zh-CN/chapter2/4.mdx
@@ -150,7 +150,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 现在我们可以像在上一节中显示的那样使用 tokenizer:

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -188,7 +188,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -231,7 +231,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 请注意, `decode` 方法不仅将索引转换回 tokens,还将属于相同单词的 tokens 组合在一起以生成可读的句子。当我们使用预测新文本的模型(根据提示生成的文本,或序列到序列问题(如翻译或摘要))时,这样的功能将非常有用。

diff --git a/chapters/zh-TW/chapter2/4.mdx b/chapters/zh-TW/chapter2/4.mdx
index d7991f260..d3e568add 100644
--- a/chapters/zh-TW/chapter2/4.mdx
+++ b/chapters/zh-TW/chapter2/4.mdx
@@ -148,7 +148,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 我們現在可以使用標記器(tokenizer),如上一節所示:

 ```python
-tokenizer("Using a Transformer network is simple")
+tokenizer("Using a transformer network is simple")
 ```

 ```python out
@@ -186,7 +186,7 @@ from transformers import AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-sequence = "Using a Transformer network is simple"
+sequence = "Using a transformer network is simple"

 tokens = tokenizer.tokenize(sequence)
 print(tokens)
@@ -228,7 +228,7 @@ print(decoded_string)
 ```

 ```python out
-'Using a Transformer network is simple'
+'Using a transformer network is simple'
 ```

 請注意, `decode` 方法不僅將索引轉換回標記(token),還將屬於相同單詞的標記(token)組合在一起以生成可讀的句子。當我們使用預測新文本的模型(根據提示生成的文本,或序列到序列問題(如翻譯或摘要))時,這種行為將非常有用。
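To verify why the lowercase change is needed: `bert-base-cased` is a cased checkpoint, so `Transformer` and `transformer` map to different WordPiece splits, and only the lowercase input reproduces the outputs printed in the chapter. Below is a minimal local check, assuming `transformers` is installed; the splits in the comments are what the `bert-base-cased` vocabulary should produce, not output captured for this patch.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Old, capitalized input: "Transformer" is absent from the cased vocab,
# so it is split into subwords that keep the capital T.
print(tokenizer.tokenize("Using a Transformer network is simple"))
# expected: ['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']

# Fixed, lowercase input: matches the token list shown in the chapter.
tokens = tokenizer.tokenize("Using a transformer network is simple")
print(tokens)
# expected: ['Using', 'a', 'transform', '##er', 'network', 'is', 'simple']

# Round-trip through ids and back with `decode`, as the chapter does.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokenizer.decode(ids))
# expected: 'Using a transformer network is simple'
```

Since every translated chapter uses the same English example sentence, this one check covers all fourteen files touched by the patch.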