8 changes: 4 additions & 4 deletions chapters/de/chapter3/2.mdx
@@ -167,8 +167,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

Wir können jedoch nicht einfach zwei Sequenzen an das Modell übergeben und eine Vorhersage erhalten, ob die beiden Sätze paraphrasiert sind oder nicht. Wir müssen die beiden Sequenzen als Paar behandeln und die entsprechende Vorverarbeitung anwenden. Glücklicherweise kann der Tokenizer auch ein Sequenzpaar nehmen und es so vorbereiten, wie es unser BERT-Modell erwartet:
@@ -224,8 +224,8 @@ Nachdem wir nun gesehen haben, wie unser Tokenizer mit einem Satzpaar umgehen ka

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/en/chapter3/2.mdx
@@ -120,8 +120,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```
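The `list(...)` wrapper matters because, on recent releases of 🤗 Datasets, indexing a column such as `raw_datasets["train"]["sentence1"]` may return a lazy column object rather than a plain Python list of strings, which the tokenizer does not accept as `List[str]`. Here is a minimal sketch of the pattern being applied, assuming the GLUE MRPC dataset used in this chapter and a `datasets` version where column indexing is lazy:

```py
# Minimal sketch of the pattern applied in this diff.
# Assumption: on recent `datasets` releases, indexing a column may not return
# a plain Python list, so we materialize it with list() before tokenizing.
from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

column = raw_datasets["train"]["sentence1"]
print(type(column))  # may be a lazy column wrapper rather than list, depending on the version

# Wrapping in list() yields a plain list of strings, which the tokenizer accepts:
tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
print(len(tokenized_sentences_1["input_ids"]))  # one encoding per training sentence
```

The same wrapping is applied below where both columns are passed together with `padding=True` and `truncation=True`.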

> [!TIP]
@@ -180,8 +180,8 @@ Now that we have seen how our tokenizer can deal with one pair of sentences, we

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/es/chapter3/2.mdx
@@ -171,8 +171,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

Sin embargo, no podemos simplemente pasar dos secuencias al modelo y obtener una predicción indicando si estas son paráfrasis o no. Necesitamos manipular las dos secuencias como un par y aplicar el preprocesamiento apropiado.
@@ -229,8 +229,8 @@ Ahora que hemos visto como nuestro tokenizador puede trabajar con un par de orac

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/fa/chapter3/2.mdx
@@ -207,8 +207,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

</div>
@@ -288,8 +288,8 @@ tokenizer.convert_ids_to_tokens(inputs["input_ids"])

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/fr/chapter3/2.mdx
@@ -177,8 +177,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

Cependant, nous ne pouvons pas simplement passer deux séquences au modèle et obtenir une prédiction pour savoir si les deux phrases sont des paraphrases ou non. Nous devons traiter les deux séquences comme une paire, et appliquer le prétraitement approprié. Heureusement, le *tokenizer* peut également prendre une paire de séquences et la préparer de la manière attendue par notre modèle BERT :
@@ -236,8 +236,8 @@ Maintenant que nous avons vu comment notre *tokenizer* peut traiter une paire de

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/hi/chapter3/2.mdx
@@ -167,8 +167,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

हालाँकि, हम केवल दो अनुक्रमों को मॉडल में पारित नहीं कर सकते और प्रिडिक्शन कर सकते कि दो वाक्य पैराफ्रेश हैं या नहीं। हमें दो अनुक्रमों को एक जोड़ी के रूप में संभालने की जरूरत है, और उपयुक्त पूर्व प्रसंस्करण लागू करना है। सौभाग्य से, टोकननाइज़र अनुक्रमों की जोड़ी भी ले सकता है और इसे हमारे BERT मॉडल की अपेक्षा के अनुसार तैयार कर सकता है:
@@ -224,8 +224,8 @@ tokenizer.convert_ids_to_tokens(inputs["input_ids"])

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/it/chapter3/2.mdx
@@ -168,8 +168,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

Tuttavia, non si possono semplicemente passare al modello due frasi e sperare di predire se l'una è una parafrasi dell'altra o no. Bisogna gestire le due frasi come una coppia, e applicare il preprocessing necessario. Fortunatamente, il tokenizer può anche prendere come input una coppia di frasi e prepararla nel formato atteso dal modello BERT:
@@ -225,8 +225,8 @@ Ora che abbiamo visto come il tokenizer può gestire una coppia di frasi, possia

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/ko/chapter3/2.mdx
@@ -120,8 +120,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

> [!TIP]
@@ -180,8 +180,8 @@ tokenizer.convert_ids_to_tokens(inputs["input_ids"])

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/my/chapter3/2.mdx
@@ -121,8 +121,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

> [!TIP]
@@ -181,8 +181,8 @@ next sentence prediction မှာ၊ model ကို စာကြောင်

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/ro/chapter3/2.mdx
@@ -166,8 +166,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

Cu toate acestea, nu putem pur și simplu să transmitem două secvențe modelului și să obținem o predicție care să indice dacă cele două propoziții sunt parafraze sau nu. Trebuie să tratăm cele două secvențe ca pe o pereche și să aplicăm preprocesarea corespunzătoare. Din fericire, tokenizatorul poate, de asemenea, să ia o pereche de secvențe și să le pregătească în modul în care se așteaptă modelul nostru BERT:
@@ -223,8 +223,8 @@ Acum că am văzut cum tokenizatorul nostru poate trata o pereche de propoziții

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/ru/chapter3/2.mdx
@@ -167,8 +167,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

Однако мы не можем просто передать две последовательности в модель и получить прогноз того, являются ли эти два предложения парафразами или нет. Нам нужно обрабатывать две последовательности как пару и применять соответствующую предварительную обработку. К счастью, токенизатор также может взять пару последовательностей и подготовить их так, как ожидает наша модель BERT:
@@ -224,8 +224,8 @@ tokenizer.convert_ids_to_tokens(inputs["input_ids"])

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/th/chapter3/2.mdx
@@ -167,8 +167,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

อย่างไรก็ตาม การส่งเพียงข้อมูลสองลำดับ (sequences) ในลักษณะนี้เข้าไปยังไม่เพียงพอที่จะทำให้โมเดลสามารถเรียนรู้และทำนายว่าประโยคทั้งสองนี้เป็นประโยคที่เกิดจากการถอดความ (paraphrase) หรือไม่ เราจะต้องจัดการให้ประโยคทั้งสองเป็นคู่กันก่อนแล้วค่อยทำการประมวลผลให้เหมาะสม ซึ่งโชคดีมากที่ tokenizer สามารถรับข้อมูลคู่ของลำดับแล้วเตรียมข้อมูลให้อยู่ในรูปแบบที่เหมาะสมกับการป้อนเข้าโมเดล BERT ของเรา:
@@ -224,8 +224,8 @@ tokenizer.convert_ids_to_tokens(inputs["input_ids"])

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/vi/chapter3/2.mdx
@@ -164,8 +164,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

Tuy nhiên, chúng ta không thể chỉ chuyển hai chuỗi vào mô hình và nhận được dự đoán liệu hai câu có phải là diễn giải hay không. Chúng ta cần xử lý hai chuỗi như một cặp và áp dụng tiền xử lý thích hợp. May mắn thay, tokenizer cũng có thể nhận một cặp chuỗi và chuẩn bị nó theo cách mà mô hình BERT của ta mong đợi:
@@ -221,8 +221,8 @@ Bây giờ chúng ta đã thấy cách trình tokenize của chúng ta có thể

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/zh-CN/chapter3/2.mdx
@@ -176,8 +176,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

不过在将两句话传递给模型,预测这两句话是否是同义之前,我们需要给这两句话依次进行适当的预处理。Tokenizer 不仅仅可以输入单个句子,还可以输入一组句子,并按照 BERT 模型所需要的输入进行处理:
@@ -282,8 +282,8 @@ tokenizer.convert_ids_to_tokens(inputs["input_ids"])

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)

8 changes: 4 additions & 4 deletions chapters/zh-TW/chapter3/2.mdx
@@ -164,8 +164,8 @@ from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
```

然而,在兩句話傳遞給模型,預測這兩句話是否是同義之前。我們需要這兩句話依次進行適當的預處理。幸運的是,標記器不僅僅可以輸入單個句子還可以輸入一組句子,並按照我們的BERT模型所期望的輸入進行處理:
@@ -221,8 +221,8 @@ tokenizer.convert_ids_to_tokens(inputs["input_ids"])

```py
tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    list(raw_datasets["train"]["sentence1"]),
+    list(raw_datasets["train"]["sentence2"]),
padding=True,
truncation=True,
)