# Load the bert-base-uncased model and tokenizer.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load both from a local directory (the model files can be downloaded
# offline beforehand, e.g. from the ModelScope community).
model = AutoModelForSequenceClassification.from_pretrained(
    "./models_save/bert-base-uncased"
)
tokenizer = AutoTokenizer.from_pretrained("./models_save/bert-base-uncased")
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./models_save/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
from datasets import load_dataset

# Setting streaming=True makes load_dataset give us an IterableDataset.
iterable_dataset = load_dataset(
    "nyu-mll/glue", "mrpc", split="train", streaming=True
)

# An IterableDataset is consumed by iteration; print only the first example.
for example in iterable_dataset:
    print(example)
    break
'(ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs.hf-mirror.com', port=443): Read timed out. (read timeout=10)"), '(Request ID: cf1b68bf-240a-4e80-94a9-0217f4e77ab9)')' thrown while requesting GET https://hf-mirror.com/datasets/nyu-mll/glue/resolve/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/mrpc/train-00000-of-00001.parquet
Retrying in 1s [Retry 1/5].
{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}
{'sentence1': ['They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',
'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .'],
'sentence2': ["On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
'Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .'],
'label': [1, 0],
'idx': [2, 3]}
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
Cell In[57], line 2
1 # 但是在IterableDataset中我们无法通过这种方式访问数据,只能通过迭代的方式逐步获取数据
----> 2 ds_iterable[2:4]
File ~/miniconda3/lib/python3.10/site-packages/torch/utils/data/dataset.py:61, in Dataset.__getitem__(self, index)
60 def __getitem__(self, index) -> T_co:
---> 61 raise NotImplementedError("Subclasses of Dataset should implement __getitem__.")
NotImplementedError: Subclasses of Dataset should implement __getitem__.
1 2
# Fetch data by iterating: pull the first example from the dataset.
next(iter(ds_iterable))
{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
'label': 1,
'idx': 0}
1 2
# Fetch a subset of a given size by iterating: here, 10 examples via take().
list(ds_iterable.take(10))
[{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
'label': 1,
'idx': 0},
{'sentence1': "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
'sentence2': "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
'label': 0,
'idx': 1},
{'sentence1': 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',
'sentence2': "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
'label': 1,
'idx': 2},
{'sentence1': 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',
'sentence2': 'Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .',
'label': 0,
'idx': 3},
{'sentence1': 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .',
'sentence2': 'PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .',
'label': 1,
'idx': 4},
{'sentence1': 'Revenue in the first quarter of the year dropped 15 percent from the same period a year earlier .',
'sentence2': "With the scandal hanging over Stewart 's company , revenue the first quarter of the year dropped 15 percent from the same period a year earlier .",
'label': 1,
'idx': 5},
{'sentence1': 'The Nasdaq had a weekly gain of 17.27 , or 1.2 percent , closing at 1,520.15 on Friday .',
'sentence2': 'The tech-laced Nasdaq Composite .IXIC rallied 30.46 points , or 2.04 percent , to 1,520.15 .',
'label': 0,
'idx': 6},
{'sentence1': 'The DVD-CCA then appealed to the state Supreme Court .',
'sentence2': 'The DVD CCA appealed that decision to the U.S. Supreme Court .',
'label': 1,
'idx': 7},
{'sentence1': 'That compared with $ 35.18 million , or 24 cents per share , in the year-ago period .',
'sentence2': 'Earnings were affected by a non-recurring $ 8 million tax benefit in the year-ago period .',
'label': 0,
'idx': 8},
{'sentence1': 'Shares of Genentech , a much larger company with several products on the market , rose more than 2 percent .',
'sentence2': 'Shares of Xoma fell 16 percent in early trade , while shares of Genentech , a much larger company with several products on the market , were up 2 percent .',
'label': 0,
'idx': 10}]
4. 使用Datasets对数据进行预处理
在几乎所有的预处理场景中,你都需要根据数据集的模态(文本、音频或图像)执行相应的操作:
对文本数据集进行分词(tokenization)。
对音频数据集进行重采样。
对图像数据集应用变换。
1 2 3 4 5
from transformers import AutoTokenizer
from datasets import load_dataset
'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'