18 Commits

Author SHA1 Message Date
  color d1a8e6d7f0 Merge pull request 'master' (#21) from MindSpore/mindnlp:master into master 1 year ago
  邓伟键 cf916aa4a5
开源实习RAG模型 (#1479) 1 year ago
  邓伟键 38311bdcd9
开源实习Idefics模型 (#1480) 1 year ago
  nate.river 2090f4dbb4
fix hubert & Module hooks (#1532) 1 year ago
  ShangJingLi cdf4aeb832
florence2添加requirement后 (#1512) 1 year ago
  1ovegood d9b5be5fad
【开源实习】SuperPoint模型迁移 (#1488) 1 year ago
  mysterious hhhh fa77277794
【开源实习】NLLB模型迁移 (#1426) 1 year ago
  wuzhirong520 aee44a1e7b
TVLT(refactored) (#1444) 1 year ago
  wuzhirong520 ad2cc7d0da
REALM(refactored) (#1432) 1 year ago
  wuzhirong520 c2d7cda0ea
FSMT(refactored) (#1421) 1 year ago
  wuzhirong520 77ee83c111
FUYU(refactored) (#1410) 1 year ago
  nate.river 3f7d094a35
update qwen2 & fix bugs (#1523) 1 year ago
  nate.river 50f7be1ea3
fix gpt_summarization (#1519) 1 year ago
  nate.river 0253ec1884
add lr_scheduler/clip_grad and fix GroupNorm error. (#1517) 1 year ago
  nate.river 94fda61863
fix llama on MS2.2 GPU (#1514) 1 year ago
  nate.river c112cc3371
fix pylint error on github CI (#1510) 1 year ago
  nate.river 9cbb9b81fb
fix errors & add nll_loss, logsigmoid (#1507) 1 year ago
  nate.river 00c521e92f
add glm4 (#1504) 1 year ago
100 changed files with 21496 additions and 4527 deletions
Split View
  1. +3
    -3
      README.md
  2. +294
    -41
      examples/classification/bert_emotect_finetune.ipynb
  3. +0
    -280
      examples/classification/bert_graph_emotect_finetune.ipynb
  4. +230
    -129
      examples/classification/bert_imdb_finetune.ipynb
  5. +97
    -187
      examples/text_generation/gpt2_summarization.ipynb
  6. +30
    -0
      llm/inference/chatglm4/simple_inference.py
  7. +0
    -0
      llm/inference/llama2/origin_llama/download.sh
  8. +0
    -0
      llm/inference/llama2/origin_llama/example_chat_completion.py
  9. +0
    -0
      llm/inference/llama2/origin_llama/example_text_completion.py
  10. +0
    -0
      llm/inference/llama2/origin_llama/llama/__init__.py
  11. +0
    -0
      llm/inference/llama2/origin_llama/llama/generation.py
  12. +0
    -0
      llm/inference/llama2/origin_llama/llama/model.py
  13. +0
    -0
      llm/inference/llama2/origin_llama/llama/tokenizer.py
  14. +54
    -0
      llm/inference/llama2/simple_inference_with_static_cache.py
  15. +0
    -1
      llm/inference/llama3/run_llama3.py
  16. +4
    -0
      mindnlp/configs.py
  17. +1
    -20
      mindnlp/core/__init__.py
  18. +2
    -0
      mindnlp/core/autograd/__init__.py
  19. +34
    -0
      mindnlp/core/autograd/grad_mode.py
  20. +17
    -2
      mindnlp/core/nn/functional.py
  21. +4
    -4
      mindnlp/core/nn/modules/activation.py
  22. +5
    -1
      mindnlp/core/nn/modules/conv.py
  23. +163
    -4
      mindnlp/core/nn/modules/module.py
  24. +13
    -8
      mindnlp/core/nn/modules/normalization.py
  25. +3
    -0
      mindnlp/core/nn/utils/__init__.py
  26. +120
    -0
      mindnlp/core/nn/utils/clip_grad.py
  27. +3
    -2
      mindnlp/core/nn/utils/weight_norm.py
  28. +17
    -0
      mindnlp/core/ops/array.py
  29. +7
    -3
      mindnlp/core/ops/creation.py
  30. +3
    -0
      mindnlp/core/ops/random.py
  31. +3
    -0
      mindnlp/core/optim/__init__.py
  32. +2
    -2
      mindnlp/core/optim/adam.py
  33. +2
    -2
      mindnlp/core/optim/adamw.py
  34. +2166
    -0
      mindnlp/core/optim/lr_scheduler.py
  35. +2
    -2
      mindnlp/core/optim/sgd.py
  36. +95
    -8
      mindnlp/core/serialization.py
  37. +23
    -23
      mindnlp/engine/trainer/base.py
  38. +11
    -10
      mindnlp/engine/utils.py
  39. +5
    -0
      mindnlp/peft/config.py
  40. +0
    -1
      mindnlp/peft/mapping.py
  41. +7
    -8
      mindnlp/peft/tuners/lora/layer.py
  42. +1
    -1
      mindnlp/peft/tuners/lora/model.py
  43. +2
    -1
      mindnlp/transformers/activations.py
  44. +29
    -34
      mindnlp/transformers/cache_utils.py
  45. +4
    -1
      mindnlp/transformers/configuration_utils.py
  46. +1
    -1
      mindnlp/transformers/generation/candidate_generator.py
  47. +7
    -7
      mindnlp/transformers/generation/configuration_utils.py
  48. +17
    -4
      mindnlp/transformers/generation/logits_process.py
  49. +1
    -1
      mindnlp/transformers/generation/stopping_criteria.py
  50. +31
    -8
      mindnlp/transformers/generation/utils.py
  51. +540
    -0
      mindnlp/transformers/modeling_rope_utils.py
  52. +24
    -9
      mindnlp/transformers/modeling_utils.py
  53. +27
    -1
      mindnlp/transformers/models/__init__.py
  54. +6
    -6
      mindnlp/transformers/models/auto/auto_factory.py
  55. +36
    -0
      mindnlp/transformers/models/auto/configuration_auto.py
  56. +3
    -3
      mindnlp/transformers/models/auto/image_processing_auto.py
  57. +6
    -0
      mindnlp/transformers/models/auto/modeling_auto.py
  58. +1
    -0
      mindnlp/transformers/models/auto/processing_auto.py
  59. +2
    -2
      mindnlp/transformers/models/auto/tokenization_auto.py
  60. +3
    -3
      mindnlp/transformers/models/blip_2/modeling_blip_2.py
  61. +10
    -0
      mindnlp/transformers/models/chatglm4/__init__.py
  62. +61
    -0
      mindnlp/transformers/models/chatglm4/configuration_chatglm4.py
  63. +1017
    -0
      mindnlp/transformers/models/chatglm4/modeling_chatglm4.py
  64. +233
    -0
      mindnlp/transformers/models/chatglm4/tokenization_chatglm4.py
  65. +29
    -0
      mindnlp/transformers/models/florence2/__init__.py
  66. +333
    -0
      mindnlp/transformers/models/florence2/configuration_florence2.py
  67. +2561
    -0
      mindnlp/transformers/models/florence2/modeling_florence2.py
  68. +1098
    -0
      mindnlp/transformers/models/florence2/processing_florence2.py
  69. +26
    -0
      mindnlp/transformers/models/fsmt/__init__.py
  70. +216
    -0
      mindnlp/transformers/models/fsmt/configuration_fsmt.py
  71. +1273
    -0
      mindnlp/transformers/models/fsmt/modeling_fsmt.py
  72. +520
    -0
      mindnlp/transformers/models/fsmt/tokenization_fsmt.py
  73. +28
    -0
      mindnlp/transformers/models/fuyu/__init__.py
  74. +229
    -0
      mindnlp/transformers/models/fuyu/configuration_fuyu.py
  75. +734
    -0
      mindnlp/transformers/models/fuyu/image_processing_fuyu.py
  76. +295
    -0
      mindnlp/transformers/models/fuyu/modeling_fuyu.py
  77. +700
    -0
      mindnlp/transformers/models/fuyu/processing_fuyu.py
  78. +2
    -8
      mindnlp/transformers/models/hubert/__init__.py
  79. +19
    -87
      mindnlp/transformers/models/hubert/configuration_hubert.py
  80. +308
    -1111
      mindnlp/transformers/models/hubert/modeling_hubert.py
  81. +29
    -0
      mindnlp/transformers/models/idefics/__init__.py
  82. +317
    -0
      mindnlp/transformers/models/idefics/configuration_idefics.py
  83. +173
    -0
      mindnlp/transformers/models/idefics/image_processing_idefics.py
  84. +1555
    -0
      mindnlp/transformers/models/idefics/modeling_idefics.py
  85. +196
    -0
      mindnlp/transformers/models/idefics/perceiver.py
  86. +429
    -0
      mindnlp/transformers/models/idefics/processing_idefics.py
  87. +515
    -0
      mindnlp/transformers/models/idefics/vision.py
  88. +17
    -26
      mindnlp/transformers/models/layoutlmv3/image_processing_layoutlmv3.py
  89. +66
    -86
      mindnlp/transformers/models/llama/configuration_llama.py
  90. +627
    -1084
      mindnlp/transformers/models/llama/modeling_llama.py
  91. +49
    -219
      mindnlp/transformers/models/llama/tokenization_llama.py
  92. +57
    -204
      mindnlp/transformers/models/llama/tokenization_llama_fast.py
  93. +58
    -0
      mindnlp/transformers/models/nllb/__init__.py
  94. +442
    -0
      mindnlp/transformers/models/nllb/tokenization_nllb.py
  95. +395
    -0
      mindnlp/transformers/models/nllb/tokenization_nllb_fast.py
  96. +378
    -879
      mindnlp/transformers/models/qwen2/modeling_qwen2.py
  97. +25
    -0
      mindnlp/transformers/models/rag/__init__.py
  98. +127
    -0
      mindnlp/transformers/models/rag/configuration_rag.py
  99. +1521
    -0
      mindnlp/transformers/models/rag/modeling_rag.py
  100. +667
    -0
      mindnlp/transformers/models/rag/retrieval_rag.py

+ 3
- 3
README.md View File

@@ -25,9 +25,9 @@

* 🤗 Hugging *huggingface* ecosystem, we use **datasets** lib as default dataset loader to support
mounts of useful datasets.
* 📝 MindNLP supports NLP tasks such as *language model*, *machine translation*, *question answering*, *sentiment analysis*, *sequence labeling*, *summarization*, etc. You can access them through [examples](https://github.com/mindspore-lab/mindnlp/examples/).
* 🚀 MindNLP currently supports industry-leading Large Language Models (LLMs), including **Llama**, **GLM**, **RWKV**, etc. For support related to large language models, including ***pre-training***, ***fine-tuning***, and **inference** demo examples, you can find them in the ["llm" directory](https://github.com/mindspore-lab/mindnlp/llm/).
* 🤗 Pretrained models support ***huggingface transformers-like apis***, including **60+** models like **[BERT](https://github.com/mindspore-lab/mindnlp/mindnlp/transformers/models/bert)**, **[Roberta](https://github.com/mindspore-lab/mindnlp/mindnlp/transformers/models/roberta)**, **[GPT2](https://github.com/mindspore-lab/mindnlp/mindnlp/transformers/models/gpt2)**, **[T5](https://github.com/mindspore-lab/mindnlp/mindnlp/transformers/models/t5)**, etc.
* 📝 MindNLP supports NLP tasks such as *language model*, *machine translation*, *question answering*, *sentiment analysis*, *sequence labeling*, *summarization*, etc. You can access them through [examples](./examples/).
* 🚀 MindNLP currently supports industry-leading Large Language Models (LLMs), including **Llama**, **GLM**, **RWKV**, etc. For support related to large language models, including ***pre-training***, ***fine-tuning***, and **inference** demo examples, you can find them in the ["llm" directory](./llm/).
* 🤗 Pretrained models support ***huggingface transformers-like apis***, including **60+** models like **[BERT](./mindnlp/transformers/models/bert)**, **[Roberta](./mindnlp/transformers/models/roberta)**, **[GPT2](./mindnlp/transformers/models/gpt2)**, **[T5](./mindnlp/transformers/models/t5)**, etc.
You can use them easily by following code snippet:
```python
from mindnlp.transformers import AutoModel


+ 294
- 41
examples/classification/bert_emotect_finetune.ipynb View File

@@ -2,24 +2,49 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for <class 'numpy.float64'> type is zero.\n",
" setattr(self, word, getattr(machar, word).flat[0])\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for <class 'numpy.float64'> type is zero.\n",
" return self._float_to_str(self.smallest_subnormal)\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for <class 'numpy.float32'> type is zero.\n",
" setattr(self, word, getattr(machar, word).flat[0])\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for <class 'numpy.float32'> type is zero.\n",
" return self._float_to_str(self.smallest_subnormal)\n",
"Building prefix dict from the default dictionary ...\n",
"Loading model from cache /tmp/jieba.cache\n",
"Loading model cost 0.914 seconds.\n",
"Prefix dict has been built successfully.\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/Cython/Compiler/Main.py:381: FutureWarning: Cython directive 'language_level' not set, using '3str' for now (Py3). This has changed from earlier releases! File: /home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/mindnlp/transformers/models/graphormer/algos_graphormer.pyx\n",
" tree = Parsing.p_module(s, pxd, full_module_name)\n",
"In file included from /home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/include/numpy/ndarraytypes.h:1929,\n",
" from /home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/include/numpy/ndarrayobject.h:12,\n",
" from /home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/include/numpy/arrayobject.h:5,\n",
" from /home/lvyufeng/.pyxbld/temp.linux-aarch64-cpython-39/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/mindnlp/transformers/models/graphormer/algos_graphormer.c:1240:\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning \"Using deprecated NumPy API, disable it with \" \"#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION\" [-Wcpp]\n",
" 17 | #warning \"Using deprecated NumPy API, disable it with \" \\\n",
" | ^~~~~~~\n"
]
}
],
"source": [
"import os\n",
"\n",
"import mindspore\n",
"from mindspore.dataset import text, GeneratorDataset, transforms\n",
"from mindspore import nn, context\n",
"from mindspore.dataset import GeneratorDataset, transforms\n",
"\n",
"from mindnlp.engine import Trainer"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {
"tags": []
},
@@ -52,11 +77,35 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2024-08-07 23:03:04-- https://baidu-nlp.bj.bcebos.com/emotion_detection-dataset-1.0.0.tar.gz\n",
"Resolving baidu-nlp.bj.bcebos.com (baidu-nlp.bj.bcebos.com)... 198.18.0.38\n",
"Connecting to baidu-nlp.bj.bcebos.com (baidu-nlp.bj.bcebos.com)|198.18.0.38|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 1710581 (1.6M) [application/x-gzip]\n",
"Saving to: ‘emotion_detection.tar.gz’\n",
"\n",
"emotion_detection.t 100%[===================>] 1.63M 7.56MB/s in 0.2s \n",
"\n",
"2024-08-07 23:03:04 (7.56 MB/s) - ‘emotion_detection.tar.gz’ saved [1710581/1710581]\n",
"\n",
"data/\n",
"data/test.tsv\n",
"data/infer.tsv\n",
"data/dev.tsv\n",
"data/train.tsv\n",
"data/vocab.txt\n"
]
}
],
"source": [
"# download dataset\n",
"!wget https://baidu-nlp.bj.bcebos.com/emotion_detection-dataset-1.0.0.tar.gz -O emotion_detection.tar.gz\n",
@@ -65,14 +114,12 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def process_dataset(source, tokenizer, max_seq_len=64, batch_size=32, shuffle=True):\n",
" is_ascend = mindspore.get_context('device_target') == 'Ascend'\n",
"\n",
@@ -109,11 +156,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/mindnlp/transformers/tokenization_utils_base.py:1526: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
" warnings.warn(\n"
]
}
],
"source": [
"from mindnlp.transformers import BertTokenizer\n",
"tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')"
@@ -121,16 +177,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.pad_token_id"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {
"tags": []
},
@@ -143,52 +210,111 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"['input_ids', 'attention_mask', 'labels']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_train.get_col_names()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"mindspore.dataset.engine.datasets.BatchDataset"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(dataset_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'input_ids': Tensor(shape=[32, 64], dtype=Int64, value=\n",
"[[ 101, 6656, 6028 ... 0, 0, 0],\n",
" [ 101, 1920, 2157 ... 0, 0, 0],\n",
" [ 101, 1963, 862 ... 0, 0, 0],\n",
" ...\n",
" [ 101, 1762, 1469 ... 0, 0, 0],\n",
" [ 101, 872, 2682 ... 0, 0, 0],\n",
" [ 101, 2769, 809 ... 0, 0, 0]]), 'attention_mask': Tensor(shape=[32, 64], dtype=Int64, value=\n",
"[[1, 1, 1 ... 0, 0, 0],\n",
" [1, 1, 1 ... 0, 0, 0],\n",
" [1, 1, 1 ... 0, 0, 0],\n",
" ...\n",
" [1, 1, 1 ... 0, 0, 0],\n",
" [1, 1, 1 ... 0, 0, 0],\n",
" [1, 1, 1 ... 0, 0, 0]]), 'labels': Tensor(shape=[32], dtype=Int32, value= [1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 1, \n",
" 1, 1, 1, 1, 2, 1, 1, 0])}\n"
]
}
],
"source": [
"print(next(dataset_train.create_dict_iterator()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[MS_ALLOC_CONF]Runtime config: enable_vmm:True vmm_align_size:2MB\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"from mindnlp.transformers import BertForSequenceClassification, BertModel\n",
"from mindnlp._legacy.amp import auto_mixed_precision\n",
"\n",
"# set bert config and define parameters for training\n",
"model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=3)\n",
"model = auto_mixed_precision(model, 'O1')"
"model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {
"tags": []
},
@@ -208,11 +334,11 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import evaluate\n",
"from mindnlp import evaluate\n",
"import numpy as np\n",
"\n",
"metric = evaluate.load(\"accuracy\")\n",
@@ -225,7 +351,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -240,11 +366,110 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "184a4b63a184444a92531a00cda5bb90",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'loss': 0.3344, 'learning_rate': 5e-05, 'epoch': 1.0}\n",
"\\\r"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'eval_loss': 0.17230123281478882, 'eval_accuracy': 0.9388888888888889, 'eval_runtime': 2.4432, 'eval_samples_per_second': 13.916, 'eval_steps_per_second': 2.047, 'epoch': 1.0}\n",
"{'loss': 0.2267, 'learning_rate': 5e-05, 'epoch': 2.0}\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'eval_loss': 0.1373114436864853, 'eval_accuracy': 0.9527777777777777, 'eval_runtime': 2.6546, 'eval_samples_per_second': 12.808, 'eval_steps_per_second': 1.884, 'epoch': 2.0}\n",
"{'loss': 0.1604, 'learning_rate': 5e-05, 'epoch': 3.0}\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'eval_loss': 0.10850954055786133, 'eval_accuracy': 0.9657407407407408, 'eval_runtime': 2.712, 'eval_samples_per_second': 12.537, 'eval_steps_per_second': 1.844, 'epoch': 3.0}\n",
"{'train_runtime': 401.2162, 'train_samples_per_second': 72.26, 'train_steps_per_second': 2.258, 'train_loss': 0.2405153699794879, 'epoch': 3.0}\n"
]
},
{
"data": {
"text/plain": [
"TrainOutput(global_step=906, training_loss=0.2405153699794879, metrics={'train_runtime': 401.2162, 'train_samples_per_second': 72.26, 'train_steps_per_second': 2.258, 'train_loss': 0.2405153699794879, 'epoch': 3.0})"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# start training\n",
"trainer.train()"
@@ -252,7 +477,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {
"tags": []
},
@@ -263,7 +488,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"metadata": {
"tags": []
},
@@ -283,11 +508,31 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"inputs: '我 要 客观', predict: '中性' , label: '中性'\n",
"inputs: '靠 你 真是 说 废话 吗', predict: '消极' , label: '消极'\n",
"inputs: '口嗅 会', predict: '中性' , label: '中性'\n",
"inputs: '每次 是 表妹 带 窝 飞 因为 窝路痴', predict: '中性' , label: '中性'\n",
"inputs: '别说 废话 我 问 你 个 问题', predict: '消极' , label: '消极'\n",
"inputs: '4967 是 新加坡 那 家 银行', predict: '中性' , label: '中性'\n",
"inputs: '是 我 喜欢 兔子', predict: '积极' , label: '积极'\n",
"inputs: '你 写 过 黄山 奇石 吗', predict: '中性' , label: '中性'\n",
"inputs: '一个一个 慢慢来', predict: '中性' , label: '中性'\n",
"inputs: '我 玩 过 这个 一点 都 不 好玩', predict: '消极' , label: '消极'\n",
"inputs: '网上 开发 女孩 的 QQ', predict: '中性' , label: '中性'\n",
"inputs: '背 你 猜 对 了', predict: '中性' , label: '中性'\n",
"inputs: '我 讨厌 你 , 哼哼 哼 。 。', predict: '消极' , label: '消极'\n"
]
}
],
"source": [
"from mindspore import Tensor\n",
"\n",
@@ -297,11 +542,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"inputs: '家人们咱就是说一整个无语住了 绝绝子叠buff', predict: '中性'\n"
]
}
],
"source": [
"predict(\"家人们咱就是说一整个无语住了 绝绝子叠buff\")"
]
@@ -323,7 +576,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.9.19"
}
},
"nbformat": 4,


+ 0
- 280
examples/classification/bert_graph_emotect_finetune.ipynb View File

@@ -1,280 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"\n",
"import mindspore\n",
"from mindspore.dataset import text, GeneratorDataset, transforms\n",
"from mindspore import nn, context\n",
"\n",
"from mindnlp.engine import Trainer, Evaluator\n",
"from mindnlp.engine.callbacks import CheckpointCallback, BestModelCallback\n",
"from mindnlp.metrics import Accuracy"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# prepare dataset\n",
"class SentimentDataset:\n",
" \"\"\"Sentiment Dataset\"\"\"\n",
"\n",
" def __init__(self, path):\n",
" self.path = path\n",
" self._labels, self._text_a = [], []\n",
" self._load()\n",
"\n",
" def _load(self):\n",
" with open(self.path, \"r\", encoding=\"utf-8\") as f:\n",
" dataset = f.read()\n",
" lines = dataset.split(\"\\n\")\n",
" for line in lines[1:-1]:\n",
" label, text_a = line.split(\"\\t\")\n",
" self._labels.append(int(label))\n",
" self._text_a.append(text_a)\n",
"\n",
" def __getitem__(self, index):\n",
" return self._labels[index], self._text_a[index]\n",
"\n",
" def __len__(self):\n",
" return len(self._labels)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# download dataset\n",
"!wget https://baidu-nlp.bj.bcebos.com/emotion_detection-dataset-1.0.0.tar.gz -O emotion_detection.tar.gz\n",
"!tar xvf emotion_detection.tar.gz"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def process_dataset(source, tokenizer, max_seq_len=64, batch_size=32, shuffle=True):\n",
" column_names = [\"label\", \"text_a\"]\n",
" \n",
" dataset = GeneratorDataset(source, column_names=column_names, shuffle=shuffle)\n",
" # transforms\n",
" type_cast_op = transforms.TypeCast(mindspore.int32)\n",
" def tokenize_and_pad(text):\n",
" tokenized = tokenizer(text, padding='max_length', truncation=True, max_length=max_seq_len)\n",
" return tokenized['input_ids'], tokenized['attention_mask']\n",
" # map dataset\n",
" dataset = dataset.map(operations=tokenize_and_pad, input_columns=\"text_a\", output_columns=['input_ids', 'attention_mask'])\n",
" dataset = dataset.map(operations=[type_cast_op], input_columns=\"label\", output_columns='labels')\n",
" # batch dataset\n",
" dataset = dataset.batch(batch_size)\n",
"\n",
" return dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from mindnlp.transformers import BertTokenizer\n",
"tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"dataset_train = process_dataset(SentimentDataset(\"data/train.tsv\"), tokenizer)\n",
"dataset_val = process_dataset(SentimentDataset(\"data/dev.tsv\"), tokenizer)\n",
"dataset_test = process_dataset(SentimentDataset(\"data/test.tsv\"), tokenizer, shuffle=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"dataset_train.get_col_names()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from mindnlp.transformers import MSBertForSequenceClassification\n",
"\n",
"# set bert config and define parameters for training\n",
"model = MSBertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=3)\n",
"model.enable_recompute()\n",
"\n",
"loss_fn = nn.CrossEntropyLoss()\n",
"optimizer = nn.Adam(model.trainable_params(), learning_rate=2e-5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"metric = Accuracy()\n",
"# define callbacks to save checkpoints\n",
"ckpoint_cb = CheckpointCallback(save_path='checkpoint', ckpt_name='bert_emotect', epochs=1, keep_checkpoint_max=2)\n",
"best_model_cb = BestModelCallback(save_path='checkpoint', ckpt_name='bert_emotect_best', auto_load=True)\n",
"\n",
"trainer = Trainer(network=model, loss_fn=loss_fn, train_dataset=dataset_train,\n",
" eval_dataset=dataset_val, metrics=metric,\n",
" epochs=1, optimizer=optimizer, callbacks=[ckpoint_cb, best_model_cb],\n",
" jit=True)\n",
"\n",
"trainer.set_amp(level='O1')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# start training\n",
"trainer.run(tgt_columns=\"labels\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"evaluator = Evaluator(network=model, eval_dataset=dataset_test, metrics=metric)\n",
"evaluator.run(tgt_columns=\"labels\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"dataset_infer = SentimentDataset(\"data/infer.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def predict(text, label=None):\n",
" label_map = {0: \"消极\", 1: \"中性\", 2: \"积极\"}\n",
"\n",
" text_tokenized = Tensor([tokenizer(text).input_ids])\n",
" logits = model(text_tokenized)\n",
" predict_label = logits[0].asnumpy().argmax()\n",
" info = f\"inputs: '{text}', predict: '{label_map[predict_label]}'\"\n",
" if label is not None:\n",
" info += f\" , label: '{label_map[label]}'\"\n",
" print(info)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from mindspore import Tensor\n",
"\n",
"for label, text in dataset_infer:\n",
" predict(text, label)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"predict(\"家人们咱就是说一整个无语住了 绝绝子叠buff\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

+ 230
- 129
examples/classification/bert_imdb_finetune.ipynb View File

@@ -2,23 +2,35 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for <class 'numpy.float64'> type is zero.\n",
" setattr(self, word, getattr(machar, word).flat[0])\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for <class 'numpy.float64'> type is zero.\n",
" return self._float_to_str(self.smallest_subnormal)\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for <class 'numpy.float32'> type is zero.\n",
" setattr(self, word, getattr(machar, word).flat[0])\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for <class 'numpy.float32'> type is zero.\n",
" return self._float_to_str(self.smallest_subnormal)\n",
"Building prefix dict from the default dictionary ...\n",
"Loading model from cache /tmp/jieba.cache\n",
"Loading model cost 0.938 seconds.\n",
"Prefix dict has been built successfully.\n"
]
}
],
"source": [
"import os\n",
"\n",
"import mindspore\n",
"from mindspore.dataset import text, GeneratorDataset, transforms\n",
"from mindspore import nn\n",
"\n",
"from mindnlp import load_dataset\n",
"from mindspore.dataset import transforms\n",
"\n",
"from mindnlp.engine import Trainer, Evaluator\n",
"from mindnlp.engine.callbacks import CheckpointCallback, BestModelCallback\n",
"from mindnlp.metrics import Accuracy"
"from mindnlp.engine import Trainer"
]
},
{
@@ -29,6 +41,8 @@
},
"outputs": [],
"source": [
"from mindnlp.dataset import load_dataset\n",
"\n",
"imdb_ds = load_dataset('imdb', split=['train', 'test'])\n",
"imdb_train = imdb_ds['train']\n",
"imdb_test = imdb_ds['test']"
@@ -61,6 +75,14 @@
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/mindnlp/transformers/tokenization_utils_base.py:1526: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
@@ -87,8 +109,6 @@
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def process_dataset(dataset, tokenizer, max_seq_len=256, batch_size=32, shuffle=False):\n",
" is_ascend = mindspore.get_context('device_target') == 'Ascend'\n",
" def tokenize(text):\n",
@@ -145,13 +165,13 @@
"data": {
"text/plain": [
"[Tensor(shape=[32, 256], dtype=Int64, value=\n",
" [[ 101, 2091, 1233 ... 1133, 1103, 102],\n",
" [ 101, 1170, 3195 ... 0, 0, 0],\n",
" [ 101, 146, 1148 ... 0, 0, 0],\n",
" [[ 101, 1398, 178 ... 0, 0, 0],\n",
" [ 101, 1188, 1437 ... 0, 0, 0],\n",
" [ 101, 5145, 2568 ... 1116, 1132, 102],\n",
" ...\n",
" [ 101, 146, 1821 ... 0, 0, 0],\n",
" [ 101, 146, 1486 ... 0, 0, 0],\n",
" [ 101, 1258, 1515 ... 0, 0, 0]]),\n",
" [ 101, 3517, 125 ... 0, 0, 0],\n",
" [ 101, 146, 1541 ... 0, 0, 0],\n",
" [ 101, 1188, 1110 ... 0, 0, 0]]),\n",
" Tensor(shape=[32, 256], dtype=Int64, value=\n",
" [[0, 0, 0 ... 0, 0, 0],\n",
" [0, 0, 0 ... 0, 0, 0],\n",
@@ -161,15 +181,15 @@
" [0, 0, 0 ... 0, 0, 0],\n",
" [0, 0, 0 ... 0, 0, 0]]),\n",
" Tensor(shape=[32, 256], dtype=Int64, value=\n",
" [[1, 1, 1 ... 1, 1, 1],\n",
" [1, 1, 1 ... 0, 0, 0],\n",
" [[1, 1, 1 ... 0, 0, 0],\n",
" [1, 1, 1 ... 0, 0, 0],\n",
" [1, 1, 1 ... 1, 1, 1],\n",
" ...\n",
" [1, 1, 1 ... 0, 0, 0],\n",
" [1, 1, 1 ... 0, 0, 0],\n",
" [1, 1, 1 ... 0, 0, 0]]),\n",
" Tensor(shape=[32], dtype=Int32, value= [0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, \n",
" 1, 0, 0, 1, 1, 0, 1, 0])]"
" Tensor(shape=[32], dtype=Int32, value= [1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, \n",
" 0, 0, 1, 0, 1, 0, 1, 1])]"
]
},
"execution_count": 8,
@@ -183,182 +203,263 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[MS_ALLOC_CONF]Runtime config: enable_vmm:True vmm_align_size:2MB\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The following parameters in checkpoint files are not loaded:\n",
"['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
"The following parameters in models are missing parameter:\n",
"['classifier.weight', 'classifier.bias']\n"
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"from mindnlp.transformers import AutoModelForSequenceClassification\n",
"from mindspore.experimental.optim import Adam\n",
"\n",
"# set bert config and define parameters for training\n",
"model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)\n",
"\n",
"optimizer = nn.Adam(model.trainable_params(), learning_rate=2e-5)\n",
"model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from mindnlp.engine import TrainingArguments\n",
"\n",
"metric = Accuracy()\n",
"training_args = TrainingArguments(\n",
" output_dir=\"bert_imdb_finetune\",\n",
" evaluation_strategy=\"epoch\",\n",
" save_strategy=\"epoch\",\n",
" logging_strategy=\"epoch\",\n",
" load_best_model_at_end=True,\n",
" num_train_epochs=3.0,\n",
" learning_rate=2e-5\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from mindnlp import evaluate\n",
"import numpy as np\n",
"\n",
"# define callbacks to save checkpoints\n",
"ckpoint_cb = CheckpointCallback(save_path='checkpoint', ckpt_name='bert_imdb_finetune', epochs=1, keep_checkpoint_max=2)\n",
"best_model_cb = BestModelCallback(save_path='checkpoint', ckpt_name='bert_imdb_finetune_best', auto_load=True)\n",
"metric = evaluate.load(\"accuracy\")\n",
"\n",
"trainer = Trainer(network=model, train_dataset=dataset_train,\n",
" eval_dataset=dataset_val, metrics=metric,\n",
" epochs=3, optimizer=optimizer, callbacks=[ckpoint_cb, best_model_cb],\n",
" jit=False)"
"def compute_metrics(eval_pred):\n",
" logits, labels = eval_pred\n",
" predictions = np.argmax(logits, axis=-1)\n",
" return metric.compute(predictions=predictions, references=labels)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=dataset_train,\n",
" eval_dataset=dataset_val,\n",
" compute_metrics=compute_metrics\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The train will start from the checkpoint saved in 'checkpoint'.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 0: 100%|██████████████████████████████████████████████████████████████████████████████████| 547/547 [04:20<00:00, 2.10it/s, loss=0.32509542]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checkpoint: 'bert_imdb_finetune_epoch_0.ckpt' has been saved in epoch: 0.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 235/235 [00:40<00:00, 5.82it/s]\n"
]
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f3b06a2f5d244b09a880b0a68c18982c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1641 …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate Score: {'Accuracy': 0.9309333333333333}\n",
"---------------Best Model: 'bert_imdb_finetune_best.ckpt' has been saved in epoch: 0.---------------\n"
"{'loss': 0.3206, 'learning_rate': 2e-05, 'epoch': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 1: 100%|██████████████████████████████████████████████████████████████████████████████████| 547/547 [04:19<00:00, 2.11it/s, loss=0.19028783]\n"
]
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/235 …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checkpoint: 'bert_imdb_finetune_epoch_1.ckpt' has been saved in epoch: 1.\n"
"{'eval_loss': 0.2645550072193146, 'eval_accuracy': 0.89, 'eval_runtime': 21.6942, 'eval_samples_per_second': 10.832, 'eval_steps_per_second': 1.383, 'epoch': 1.0}\n",
"{'loss': 0.1771, 'learning_rate': 2e-05, 'epoch': 2.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 235/235 [00:40<00:00, 5.80it/s]\n"
]
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/235 …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate Score: {'Accuracy': 0.9690666666666666}\n",
"---------------Best Model: 'bert_imdb_finetune_best.ckpt' has been saved in epoch: 1.---------------\n"
"{'eval_loss': 0.28401243686676025, 'eval_accuracy': 0.9021333333333333, 'eval_runtime': 21.4019, 'eval_samples_per_second': 10.98, 'eval_steps_per_second': 1.402, 'epoch': 2.0}\n",
"{'loss': 0.1088, 'learning_rate': 2e-05, 'epoch': 3.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 2: 100%|███████████████████████████████████████████████████████████████████████████████████| 547/547 [04:19<00:00, 2.11it/s, loss=0.1257236]\n"
]
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/235 …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The maximum number of stored checkpoints has been reached.\n",
"Checkpoint: 'bert_imdb_finetune_epoch_2.ckpt' has been saved in epoch: 2.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 235/235 [00:40<00:00, 5.78it/s]\n"
"{'eval_loss': 0.3973828852176666, 'eval_accuracy': 0.9024, 'eval_runtime': 21.3734, 'eval_samples_per_second': 10.995, 'eval_steps_per_second': 1.404, 'epoch': 3.0}\n",
"{'train_runtime': 882.004, 'train_samples_per_second': 59.537, 'train_steps_per_second': 1.861, 'train_loss': 0.20214975555810458, 'epoch': 3.0}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate Score: {'Accuracy': 0.9785333333333334}\n",
"---------------Best Model: 'bert_imdb_finetune_best.ckpt' has been saved in epoch: 2.---------------\n",
"Loading best model from 'checkpoint' with '['Accuracy']': [0.9785333333333334]...\n",
"---------------The model is already load the best model from 'bert_imdb_finetune_best.ckpt'.---------------\n"
]
"data": {
"text/plain": [
"TrainOutput(global_step=1641, training_loss=0.20214975555810458, metrics={'train_runtime': 882.004, 'train_samples_per_second': 59.537, 'train_steps_per_second': 1.861, 'train_loss': 0.20214975555810458, 'epoch': 3.0})"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trainer.run(tgt_columns=\"labels\")"
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"tags": []
},
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 782/782 [02:15<00:00, 5.76it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate Score: {'Accuracy': 0.91264}\n"
]
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1b1c9c5dec1f4ee3b12cbf3019b8ba63",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/782 …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
"data": {
"text/plain": [
"{'eval_loss': 0.25377440452575684,\n",
" 'eval_accuracy': 0.89504,\n",
" 'eval_runtime': 70.9269,\n",
" 'eval_samples_per_second': 11.025,\n",
" 'eval_steps_per_second': 1.382,\n",
" 'epoch': 3.0}"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator = Evaluator(network=model, eval_dataset=dataset_test, metrics=metric)\n",
"evaluator.run(tgt_columns=\"labels\")"
"trainer.evaluate(dataset_test)"
]
},
{
@@ -385,7 +486,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.9.19"
}
},
"nbformat": 4,


+ 97
- 187
examples/text_generation/gpt2_summarization.ipynb View File

@@ -11,8 +11,27 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/home/daiyuxin/anaconda3/envs/mindspore/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for <class 'numpy.float64'> type is zero.\n",
" setattr(self, word, getattr(machar, word).flat[0])\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for <class 'numpy.float64'> type is zero.\n",
" return self._float_to_str(self.smallest_subnormal)\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for <class 'numpy.float32'> type is zero.\n",
" setattr(self, word, getattr(machar, word).flat[0])\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for <class 'numpy.float32'> type is zero.\n",
" return self._float_to_str(self.smallest_subnormal)\n",
"Building prefix dict from the default dictionary ...\n",
"Loading model from cache /tmp/jieba.cache\n",
"Loading model cost 0.939 seconds.\n",
"Prefix dict has been built successfully.\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/Cython/Compiler/Main.py:381: FutureWarning: Cython directive 'language_level' not set, using '3str' for now (Py3). This has changed from earlier releases! File: /home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/mindnlp/transformers/models/graphormer/algos_graphormer.pyx\n",
" tree = Parsing.p_module(s, pxd, full_module_name)\n",
"In file included from /home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/include/numpy/ndarraytypes.h:1929,\n",
" from /home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/include/numpy/ndarrayobject.h:12,\n",
" from /home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/include/numpy/arrayobject.h:5,\n",
" from /home/lvyufeng/.pyxbld/temp.linux-aarch64-cpython-39/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/mindnlp/transformers/models/graphormer/algos_graphormer.c:1240:\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:17:2: warning: #warning \"Using deprecated NumPy API, disable it with \" \"#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION\" [-Wcpp]\n",
" 17 | #warning \"Using deprecated NumPy API, disable it with \" \\\n",
" | ^~~~~~~\n"
]
}
],
@@ -113,13 +132,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/home/daiyuxin/anaconda3/envs/mindspore/lib/python3.9/site-packages/urllib3/connectionpool.py:1100: InsecureRequestWarning: Unverified HTTPS request is being made to host 'modelscope.cn'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n",
" warnings.warn(\n",
"/home/daiyuxin/anaconda3/envs/mindspore/lib/python3.9/site-packages/urllib3/connectionpool.py:1100: InsecureRequestWarning: Unverified HTTPS request is being made to host 'modelscope.cn'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n",
" warnings.warn(\n",
"/home/daiyuxin/anaconda3/envs/mindspore/lib/python3.9/site-packages/urllib3/connectionpool.py:1100: InsecureRequestWarning: Unverified HTTPS request is being made to host 'modelscope.cn'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n",
" warnings.warn(\n",
"/home/daiyuxin/anaconda3/envs/mindspore/lib/python3.9/site-packages/urllib3/connectionpool.py:1100: InsecureRequestWarning: Unverified HTTPS request is being made to host 'modelscope.cn'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n",
"/home/lvyufeng/miniconda3/envs/mindspore/lib/python3.9/site-packages/mindnlp/transformers/tokenization_utils_base.py:1526: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
" warnings.warn(\n"
]
}
@@ -200,7 +213,7 @@
"metadata": {},
"outputs": [],
"source": [
"from mindspore import ops\n",
"from mindnlp.core.nn import functional as F\n",
"from mindnlp.transformers import GPT2LMHeadModel\n",
"\n",
"class GPT2ForSummarization(GPT2LMHeadModel):\n",
@@ -210,11 +223,11 @@
" attention_mask = None,\n",
" labels = None,\n",
" ):\n",
" outputs = super().construct(input_ids=input_ids, attention_mask=attention_mask)\n",
" outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask)\n",
" shift_logits = outputs.logits[..., :-1, :]\n",
" shift_labels = labels[..., 1:]\n",
" # Flatten the tokens\n",
" loss = ops.cross_entropy(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1), ignore_index=tokenizer.pad_token_id)\n",
" loss = F.cross_entropy(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1), ignore_index=tokenizer.pad_token_id)\n",
" return loss"
]
},
@@ -223,62 +236,40 @@
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from mindspore import ops\n",
"from mindspore.nn.learning_rate_schedule import LearningRateSchedule\n",
"\n",
"class LinearWithWarmUp(LearningRateSchedule):\n",
" \"\"\"\n",
" Warmup-decay learning rate.\n",
" \"\"\"\n",
" def __init__(self, learning_rate, num_warmup_steps, num_training_steps):\n",
" super().__init__()\n",
" self.learning_rate = learning_rate\n",
" self.num_warmup_steps = num_warmup_steps\n",
" self.num_training_steps = num_training_steps\n",
"\n",
" def construct(self, global_step):\n",
" if global_step < self.num_warmup_steps:\n",
" return global_step / float(max(1, self.num_warmup_steps)) * self.learning_rate\n",
" return ops.maximum(\n",
" 0.0, (self.num_training_steps - global_step) / (max(1, self.num_training_steps - self.num_warmup_steps))\n",
" ) * self.learning_rate"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"num_epochs = 5\n",
"warmup_steps = 2000\n",
"learning_rate = 1.5e-4\n",
"\n",
"warmup_steps = 2000\n",
"max_grad_norm = 1.0\n",
"num_training_steps = num_epochs * train_dataset.get_dataset_size()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[MS_ALLOC_CONF]Runtime config: enable_vmm:True vmm_align_size:2MB\n"
]
}
],
"source": [
"from mindspore import nn\n",
"from mindnlp.transformers import GPT2Config, GPT2LMHeadModel\n",
"\n",
"config = GPT2Config(vocab_size=len(tokenizer))\n",
"model = GPT2ForSummarization(config)\n",
"\n",
"lr_scheduler = LinearWithWarmUp(learning_rate=learning_rate, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)\n",
"optimizer = nn.AdamWeightDecay(model.trainable_params(), learning_rate=lr_scheduler)"
"model = GPT2ForSummarization(config)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {
"tags": []
},
@@ -296,128 +287,85 @@
"print('number of model parameters: {}'.format(model.num_parameters()))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from mindnlp.engine import TrainingArguments\n",
"\n",
"training_args = TrainingArguments(\n",
" output_dir=\"gpt2_summarization\",\n",
" save_steps=train_dataset.get_dataset_size(),\n",
" save_total_limit=3,\n",
" logging_steps=100,\n",
" max_steps=num_training_steps,\n",
" learning_rate=learning_rate,\n",
" max_grad_norm=max_grad_norm,\n",
" warmup_steps=warmup_steps\n",
" \n",
")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Trainer will use 'StaticLossScaler' with `scale_value=2 ** 10` when `loss_scaler` is None.\n"
]
}
],
"outputs": [],
"source": [
"from mindnlp.engine import Trainer\n",
"from mindnlp.engine.callbacks import CheckpointCallback\n",
"\n",
"ckpoint_cb = CheckpointCallback(save_path='checkpoint', ckpt_name='gpt2_summarization',\n",
" epochs=1, keep_checkpoint_max=2)\n",
"\n",
"trainer = Trainer(network=model, train_dataset=train_dataset,\n",
" epochs=5, optimizer=optimizer, callbacks=ckpoint_cb)\n",
"trainer.set_amp(level='O1')"
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=train_dataset,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The train will start from the checkpoint saved in 'checkpoint'.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 0: 0%| | 0/11250 [00:00<?, ?it/s][ERROR] CORE(1809837,7f95ef9896c0,python):2023-11-26-01:46:08.184.627 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1809837/729418715.py]\n",
"[ERROR] CORE(1809837,7f95ef9896c0,python):2023-11-26-01:46:08.184.657 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1809837/729418715.py]\n",
"[ERROR] CORE(1809837,7f95ef9896c0,python):2023-11-26-01:46:08.184.892 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1809837/729418715.py]\n",
"[ERROR] CORE(1809837,7f95ef9896c0,python):2023-11-26-01:46:08.184.904 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1809837/729418715.py]\n",
"Epoch 0: 100%|████████████████████████████████████████████████████████████████████████████| 11250/11250 [1:23:37<00:00, 2.24it/s, loss=4.937714]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checkpoint: 'gpt2_summarization_epoch_0.ckpt' has been saved in epoch: 0.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 1: 100%|███████████████████████████████████████████████████████████████████████████| 11250/11250 [1:23:40<00:00, 2.24it/s, loss=4.7891397]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checkpoint: 'gpt2_summarization_epoch_1.ckpt' has been saved in epoch: 1.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 2: 100%|████████████████████████████████████████████████████████████████████████████| 11250/11250 [1:23:44<00:00, 2.24it/s, loss=4.789195]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The maximum number of stored checkpoints has been reached.\n",
"Checkpoint: 'gpt2_summarization_epoch_2.ckpt' has been saved in epoch: 2.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 3: 100%|████████████████████████████████████████████████████████████████████████████| 11250/11250 [1:23:36<00:00, 2.24it/s, loss=4.789144]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The maximum number of stored checkpoints has been reached.\n",
"Checkpoint: 'gpt2_summarization_epoch_3.ckpt' has been saved in epoch: 3.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Epoch 4: 100%|███████████████████████████████████████████████████████████████████████████| 11250/11250 [1:23:36<00:00, 2.24it/s, loss=4.7891893]\n"
]
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bf8747a13d7b4e18a9d26fbca4d900e1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/56250 …"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The maximum number of stored checkpoints has been reached.\n",
"Checkpoint: 'gpt2_summarization_epoch_4.ckpt' has been saved in epoch: 4.\n"
"{'loss': 6.8268, 'learning_rate': 7.499999999999999e-06, 'epoch': 0.01}\n",
"{'loss': 6.0453, 'learning_rate': 1.4999999999999999e-05, 'epoch': 0.02}\n",
"{'loss': 5.2041, 'learning_rate': 2.2499999999999998e-05, 'epoch': 0.03}\n",
"{'loss': 4.7843, 'learning_rate': 2.9999999999999997e-05, 'epoch': 0.04}\n",
"{'loss': 4.4802, 'learning_rate': 3.75e-05, 'epoch': 0.04}\n",
"{'loss': 4.154, 'learning_rate': 4.4999999999999996e-05, 'epoch': 0.05}\n",
"{'loss': 3.974, 'learning_rate': 5.2499999999999995e-05, 'epoch': 0.06}\n",
"{'loss': 3.838, 'learning_rate': 5.9999999999999995e-05, 'epoch': 0.07}\n",
"{'loss': 3.8311, 'learning_rate': 6.75e-05, 'epoch': 0.08}\n",
"{'loss': 3.8637, 'learning_rate': 7.5e-05, 'epoch': 0.09}\n"
]
}
],
"source": [
"trainer.run(tgt_columns=\"labels\")"
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {
"tags": []
},
@@ -442,7 +390,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {
"tags": []
},
@@ -453,58 +401,20 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[array([[ 101, 3173, 1290, 5381, 6205, 2128, 128, 3299, 129, 3189, 4510,\n",
" 8020, 6381, 5442, 6948, 1132, 840, 8021, 6381, 5442, 129, 3189,\n",
" 678, 1286, 794, 7362, 6205, 4209, 689, 1265, 2339, 7415, 1730,\n",
" 749, 6237, 1168, 8024, 754, 128, 3299, 127, 3189, 7506, 1814,\n",
" 3433, 3409, 1790, 759, 678, 752, 3125, 704, 6158, 1737, 4638,\n",
" 125, 1399, 4771, 2339, 2347, 4802, 6371, 1059, 6956, 3647, 767,\n",
" 511, 128, 3299, 127, 3189, 1119, 3247, 125, 3198, 8123, 1146,\n",
" 8024, 7362, 6205, 4689, 7506, 1814, 3433, 3409, 1790, 4209, 4771,\n",
" 2963, 6822, 676, 7339, 1762, 4209, 4771, 122, 1384, 759, 1298,\n",
" 7023, 1277, 671, 2339, 868, 7481, 6822, 6121, 868, 689, 3198,\n",
" 1355, 4495, 671, 6629, 4209, 680, 4482, 3172, 4960, 1139, 752,\n",
" 3125, 8024, 125, 1399, 4771, 2339, 6158, 1737, 511, 3131, 3001,\n",
" 782, 1447, 754, 127, 3189, 678, 1286, 1355, 4385, 671, 1399,\n",
" 6158, 1737, 4771, 2339, 8024, 5307, 1059, 1213, 2843, 3131, 3187,\n",
" 3126, 3647, 767, 511, 5307, 3131, 2844, 782, 1447, 1059, 1213,\n",
" 3131, 3001, 8024, 3297, 1400, 671, 1399, 6878, 7410, 4771, 2339,\n",
" 6890, 860, 2347, 754, 129, 3189, 8110, 3198, 8216, 1146, 2823,\n",
" 1168, 511, 5635, 3634, 8024, 125, 1399, 6878, 7410, 4771, 2339,\n",
" 6890, 860, 1059, 6956, 2823, 1168, 1285, 759, 8024, 3131, 3001,\n",
" 2339, 868, 5310, 3338, 511, 4771, 3175, 3633, 1762, 976, 1587,\n",
" 1400, 2339, 868, 511, 102]], dtype=int64), array(['渭南韩城县桑树坪井下事故中被困4名矿工全部死亡,遗体今日中午均被找到。'], dtype='<U35')]\n"
]
}
],
"outputs": [],
"source": [
"print(next(batched_test_dataset.create_tuple_iterator(output_numpy=True)))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/daiyuxin/anaconda3/envs/mindspore/lib/python3.9/site-packages/urllib3/connectionpool.py:1100: InsecureRequestWarning: Unverified HTTPS request is being made to host 'modelscope.cn'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n",
" warnings.warn(\n",
"Generation config file not found, using a generation config created from the model config.\n"
]
}
],
"outputs": [],
"source": [
"model = GPT2LMHeadModel.from_pretrained('./checkpoint/gpt2_summarization_epoch_4.ckpt', config=config)"
]
@@ -553,7 +463,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.9.19"
}
},
"nbformat": 4,


+ 30
- 0
llm/inference/chatglm4/simple_inference.py View File

@@ -0,0 +1,30 @@
"""Minimal single-turn chat inference for GLM-4-9B via MindNLP."""
import mindspore
from mindnlp.core import no_grad
from mindnlp.transformers import AutoModelForCausalLM, AutoTokenizer
# Kept for the (currently commented-out) framework profiler hooks below.
from mindspore._c_expression import _framework_profiler_step_start
from mindspore._c_expression import _framework_profiler_step_end

# Tokenizer is pulled from the ModelScope mirror rather than the HF hub.
tokenizer = AutoTokenizer.from_pretrained("ZhipuAI/glm-4-9b-chat", mirror='modelscope')

query = "你好"

# Build model-ready inputs from a one-message chat via the model's chat template.
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": query}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="ms",
    return_dict=True,
)
print(inputs)

# float16 weights keep the 9B-parameter model within device memory.
model = AutoModelForCausalLM.from_pretrained(
    "ZhipuAI/glm-4-9b-chat",
    mirror='modelscope',
    ms_dtype=mindspore.float16,
).eval()

# _framework_profiler_step_start()
# NOTE: top_k=1 with do_sample=True is effectively greedy decoding.
gen_kwargs = {"max_length": 100, "do_sample": True, "top_k": 1}
with no_grad():
    generated = model.generate(**inputs, **gen_kwargs)
    # Drop the prompt prefix so only the newly generated tokens remain.
    completion = generated[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(completion[0], skip_special_tokens=True))
# _framework_profiler_step_end()

llm/inference/llama2/download.sh → llm/inference/llama2/origin_llama/download.sh View File


llm/inference/llama2/example_chat_completion.py → llm/inference/llama2/origin_llama/example_chat_completion.py View File


llm/inference/llama2/example_text_completion.py → llm/inference/llama2/origin_llama/example_text_completion.py View File


llm/inference/llama2/llama/__init__.py → llm/inference/llama2/origin_llama/llama/__init__.py View File


llm/inference/llama2/llama/generation.py → llm/inference/llama2/origin_llama/llama/generation.py View File


llm/inference/llama2/llama/model.py → llm/inference/llama2/origin_llama/llama/model.py View File


llm/inference/llama2/llama/tokenizer.py → llm/inference/llama2/origin_llama/llama/tokenizer.py View File


+ 54
- 0
llm/inference/llama2/simple_inference_with_static_cache.py View File

@@ -0,0 +1,54 @@
# Llama-2 greedy decoding example using a pre-allocated StaticCache:
# one prefill pass over the prompts, then token-by-token decoding.
import mindspore
from mindnlp.transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache, logging
# NOTE(review): CaptureLogger and logging are imported but unused in this script.
from mindnlp.utils.testing_utils import CaptureLogger
from mindnlp.core import ops, no_grad

prompts = [
    "Simply put, the theory of relativity states that ",
    "My favorite all time favorite condiment is ketchup.",
]

# Number of new tokens to decode after the prefill step.
NUM_TOKENS_TO_GENERATE = 40

model_id = 'shakechen/llama-2-7b-hf'
tokenizer = LlamaTokenizer.from_pretrained(model_id, mirror='modelscope', pad_token="</s>", padding_side="right")
model = LlamaForCausalLM.from_pretrained(model_id, mirror='modelscope')
inputs = tokenizer(prompts, return_tensors="ms", padding=True)

# Decode exactly one next token for the whole batch.
# Greedy selection: argmax over the logits of the last position.
def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_values):
    logits = model(
        cur_token,
        position_ids=input_pos,
        cache_position=cache_position,
        past_key_values=past_key_values,
        return_dict=False,
        use_cache=True
    )[0]
    new_token = ops.argmax(logits[:, -1], dim=-1)[:, None]
    return new_token

batch_size, seq_length = inputs["input_ids"].shape
with no_grad():
    # Pre-allocate the KV cache for up to 1024 positions.
    past_key_values = StaticCache(
        config=model.config, max_batch_size=2, max_cache_len=1024, dtype=model.dtype
    )
    cache_position = ops.arange(seq_length)
    # Output buffer: prompt tokens + generated tokens (+1 slack position).
    generated_ids = ops.zeros(
        batch_size, seq_length + NUM_TOKENS_TO_GENERATE + 1, dtype=mindspore.int32
    )
    generated_ids[:, cache_position] = inputs["input_ids"].to(mindspore.int32)

    # Prefill pass over the full prompt; pick the first next token greedily.
    logits = model(
        **inputs, cache_position=cache_position, past_key_values=past_key_values,return_dict=False, use_cache=True
    )[0]
    next_token = ops.argmax(logits[:, -1], dim=-1)[:, None]
    generated_ids[:, seq_length] = next_token[:, 0]

    # Auto-regressive loop: feed back one token at a time against the cache.
    cache_position = mindspore.tensor([seq_length + 1])
    for _ in range(1, NUM_TOKENS_TO_GENERATE):
        next_token = decode_one_tokens(model, next_token, None, cache_position, past_key_values)
        generated_ids[:, cache_position] = next_token.int()
        cache_position += 1

text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
print(text)

+ 0
- 1
llm/inference/llama3/run_llama3.py View File

@@ -3,7 +3,6 @@ from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM
from mindspore._c_expression import _framework_profiler_step_start
from mindspore._c_expression import _framework_profiler_step_end


model_id = "LLM-Research/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id, mirror='modelscope')


+ 4
- 0
mindnlp/configs.py View File

@@ -18,9 +18,13 @@ Global configs
import os
from packaging import version
import mindspore
from mindspore._c_expression import MSContext # pylint: disable=no-name-in-module, import-error

SOC = MSContext.get_instance().get_ascend_soc_version()
DEVICE_TARGET = mindspore.get_context('device_target')
GENERATOR_SEED = version.parse(mindspore.__version__) >= version.parse('2.3.0')
SUPPORT_VIEW = GENERATOR_SEED
SUPPORT_BF16 = GENERATOR_SEED and '910b' in SOC
USE_PYBOOST = version.parse(mindspore.__version__) >= version.parse('2.3.0') and DEVICE_TARGET == 'Ascend'
DEFAULT_DTYPE = mindspore.float32



+ 1
- 20
mindnlp/core/__init__.py View File

@@ -13,25 +13,6 @@
# limitations under the License.
# ============================================================================
"""core module"""
import contextlib
import mindspore
from mindspore.common.api import _pynative_executor
from . import optim, ops, nn
from .utils import get_default_dtype, set_default_dtype


class no_grad(contextlib.ContextDecorator):
"""
Context Manager to disable gradient calculation. When enter this context, we will disable calculate
gradient. When exit this context, we will resume its prev state.
Currently, it can only use in Pynative mode. It also can be used as decorator.
"""

def __enter__(self):
if mindspore.get_context("mode") == mindspore.GRAPH_MODE:
raise RuntimeError("For no_grad feature, currently only support Pynative mode, but got Graph mode.")
_pynative_executor.set_enable_grad(False)

def __exit__(self, exc_type, exc_val, exc_tb):
_pynative_executor.set_enable_grad(True)
return False
from .autograd import no_grad

+ 2
- 0
mindnlp/core/autograd/__init__.py View File

@@ -0,0 +1,2 @@
"""autograd"""
from .grad_mode import no_grad

+ 34
- 0
mindnlp/core/autograd/grad_mode.py View File

@@ -0,0 +1,34 @@
# Copyright 2024 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""core module"""
import contextlib
import mindspore
from mindspore.common.api import _pynative_executor

class no_grad(contextlib.ContextDecorator):
    """
    Context manager (and decorator) that disables gradient calculation.

    On entry the current grad-enabled flag is recorded and gradient tracking
    is switched off; on exit the *previous* state is restored rather than
    unconditionally re-enabled, so nested ``no_grad`` blocks — or entering
    while gradients are already disabled — behave correctly.
    Only supported in PyNative mode.

    Raises:
        RuntimeError: if entered while MindSpore is in Graph mode.
    """

    def __enter__(self):
        if mindspore.get_context("mode") == mindspore.GRAPH_MODE:
            raise RuntimeError("For no_grad feature, currently only support Pynative mode, but got Graph mode.")
        # Save the previous state so __exit__ can restore instead of assuming
        # gradients were enabled before entry.
        # NOTE(review): relies on _pynative_executor.enable_grad() returning
        # the current flag — confirm against the installed MindSpore version.
        self.prev_state = _pynative_executor.enable_grad()
        _pynative_executor.set_enable_grad(False)

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore whatever the grad state was at __enter__ time.
        _pynative_executor.set_enable_grad(self.prev_state)
        return False

+ 17
- 2
mindnlp/core/nn/functional.py View File

@@ -55,6 +55,9 @@ def softplus(input, beta=1, threshold=20):
return mindspore.mint.nn.functional.softplus(input, beta, threshold)
return ops.softplus(input, beta, threshold)

def logsigmoid(input):
return ops.logsigmoid(input)

def leaky_relu(input, alpha=0.2):
if USE_PYBOOST:
return mindspore.mint.nn.functional.leaky_relu(input, alpha)
@@ -172,8 +175,11 @@ def binary_cross_entropy_with_logits(input, target, weight=None, reduction='mean
return mindspore.mint.nn.functional.binary_cross_entropy_with_logits(input, target, weight, reduction, pos_weight)
return ops.binary_cross_entropy_with_logits(input, target, weight, pos_weight, reduction)

def log_softmax(input, dim=-1):
return ops.log_softmax(input, dim)
def log_softmax(input, dim=-1, dtype=None):
out = ops.log_softmax(input, dim)
if dtype is not None:
out = out.to(dtype)
return out

def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False):
if USE_PYBOOST:
@@ -195,6 +201,11 @@ def pad(input, pad, mode='constant', value=0.0):
return ops.pad(input, pad, mode)
return ops.pad(input, pad, mode, value)

def nll_loss(input, target, weight=None, ignore_index=-100, reduction='mean', label_smoothing=0.0):
# _nll_loss = _get_cache_prim(ops.NLLLoss)(reduction, ignore_index)
# return _nll_loss(input, target, weight)
return ops.nll_loss(input, target, weight, ignore_index, reduction, label_smoothing)

def cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean', label_smoothing=0.0):
return ops.cross_entropy(input, target, weight, ignore_index, reduction, label_smoothing)

@@ -212,6 +223,10 @@ def softmax(input, dim=-1, *, dtype=None):
return ops.softmax(input, dim, dtype=dtype)

def layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-5):
if weight is None:
weight = ops.ones(normalized_shape, dtype=input.dtype)
if bias is None:
bias = ops.zeros(normalized_shape, dtype=input.dtype)
if USE_PYBOOST:
return mindspore.mint.layer_norm(input, normalized_shape, weight, bias, eps)
if weight is not None:


+ 4
- 4
mindnlp/core/nn/modules/activation.py View File

@@ -452,12 +452,12 @@ class MultiheadAttention(Module):
# make sure that the transpose op does not affect the "is" property
if key is value:
if query is key:
query = key = value = query.transpose(1, 0)
query = key = value = ops.transpose(query, 1, 0)
else:
query, key = (x.transpose(1, 0) for x in (query, key))
query, key = (ops.transpose(x, 1, 0) for x in (query, key))
value = key
else:
query, key, value = (x.transpose(1, 0) for x in (query, key, value))
query, key, value = (ops.transpose(x, 1, 0) for x in (query, key, value))

if not self._qkv_same_embed_dim:
attn_output, attn_output_weights = F.multi_head_attention_forward(
@@ -486,7 +486,7 @@ class MultiheadAttention(Module):
average_attn_weights=average_attn_weights,
is_causal=is_causal)
if self.batch_first and is_batched:
return attn_output.transpose(1, 0), attn_output_weights
return ops.transpose(attn_output, 1, 0), attn_output_weights
else:
return attn_output, attn_output_weights



+ 5
- 1
mindnlp/core/nn/modules/conv.py View File

@@ -182,11 +182,15 @@ class Conv1d(_ConvNd):
in_channels, out_channels, kernel_size_, stride_, padding_, dilation_,
False, _single(0), groups, bias, padding_mode, **factory_kwargs)

pad_mode = 'pad'
pad_mode = 'valid'
pad = padding
if isinstance(padding, tuple):
if padding[0] != 0:
pad_mode = 'pad'
pad = (0, 0, padding[0], padding[0])
elif isinstance(padding, int):
if padding != 0:
pad_mode = 'pad'
pad = (0, 0) + (padding,) * 2
if not isinstance(padding, (int, tuple)):
pad_mode = padding


+ 163
- 4
mindnlp/core/nn/modules/module.py View File

@@ -30,6 +30,14 @@ _global_module_registration_hooks: Dict[int, Callable] = OrderedDict()
_global_parameter_registration_hooks: Dict[int, Callable] = OrderedDict()


_global_backward_pre_hooks: Dict[int, Callable] = OrderedDict()
_global_backward_hooks: Dict[int, Callable] = OrderedDict()
_global_is_full_backward_hook: Optional[bool] = None
_global_forward_pre_hooks: Dict[int, Callable] = OrderedDict()
_global_forward_hooks: Dict[int, Callable] = OrderedDict()
_global_forward_hooks_always_called: Dict[int, bool] = OrderedDict()


class Module:
r"""Base class for all neural network modules.

@@ -326,14 +334,165 @@ class Module:
fn(self)
return self

def __call__(self, *input, **kwargs):
result = self.forward(*input, **kwargs)
return result
def _wrapped_call_impl(self, *args, **kwargs):
return self._call_impl(*args, **kwargs)

# torchrec tests the code consistency with the following code
# fmt: off
def _call_impl(self, *args, **kwargs):
forward_call = self.forward
# If we don't have any hooks, we want to skip the rest of the logic in
# this function, and just call forward.
if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
or _global_backward_pre_hooks or _global_backward_hooks
or _global_forward_hooks or _global_forward_pre_hooks):
return forward_call(*args, **kwargs)

try:
result = None
called_always_called_hooks = set()

full_backward_hooks, non_full_backward_hooks = [], []
backward_pre_hooks = []
if self._backward_pre_hooks or _global_backward_pre_hooks:
backward_pre_hooks = self._get_backward_pre_hooks()

if self._backward_hooks or _global_backward_hooks:
full_backward_hooks, non_full_backward_hooks = self._get_backward_hooks()

if _global_forward_pre_hooks or self._forward_pre_hooks:
for hook_id, hook in (
*_global_forward_pre_hooks.items(),
*self._forward_pre_hooks.items(),
):
if hook_id in self._forward_pre_hooks_with_kwargs:
args_kwargs_result = hook(self, args, kwargs) # type: ignore[misc]
if args_kwargs_result is not None:
if isinstance(args_kwargs_result, tuple) and len(args_kwargs_result) == 2:
args, kwargs = args_kwargs_result
else:
raise RuntimeError(
"forward pre-hook must return None or a tuple "
f"of (new_args, new_kwargs), but got {args_kwargs_result}."
)
else:
args_result = hook(self, args)
if args_result is not None:
if not isinstance(args_result, tuple):
args_result = (args_result,)
args = args_result

bw_hook = None
# if full_backward_hooks or backward_pre_hooks:
# bw_hook = BackwardHook(self, full_backward_hooks, backward_pre_hooks)
# args = bw_hook.setup_input_hook(args)

result = forward_call(*args, **kwargs)
if _global_forward_hooks or self._forward_hooks:
for hook_id, hook in (
*_global_forward_hooks.items(),
*self._forward_hooks.items(),
):
# mark that always called hook is run
if hook_id in self._forward_hooks_always_called or hook_id in _global_forward_hooks_always_called:
called_always_called_hooks.add(hook_id)

if hook_id in self._forward_hooks_with_kwargs:
hook_result = hook(self, args, kwargs, result)
else:
hook_result = hook(self, args, result)

if hook_result is not None:
result = hook_result

if bw_hook:
if not isinstance(result, (mindspore.Tensor, tuple)):
warnings.warn("For backward hooks to be called,"
" module output should be a Tensor or a tuple of Tensors"
f" but received {type(result)}")
result = bw_hook.setup_output_hook(result)

# Handle the non-full backward hooks
if non_full_backward_hooks:
var = result
while not isinstance(var, mindspore.Tensor):
if isinstance(var, dict):
var = next(v for v in var.values() if isinstance(v, mindspore.Tensor))
else:
var = var[0]
# grad_fn = var.grad_fn
# if grad_fn is not None:
# for hook in non_full_backward_hooks:
# grad_fn.register_hook(_WrappedHook(hook, self))
# self._maybe_warn_non_full_backward_hook(args, result, grad_fn)

return result

except Exception:
# run always called hooks if they have not already been run
# For now only forward hooks have the always_call option but perhaps
# this functionality should be added to full backward hooks as well.
for hook_id, hook in _global_forward_hooks.items():
if hook_id in _global_forward_hooks_always_called and hook_id not in called_always_called_hooks: # type: ignore[possibly-undefined]
try:
hook_result = hook(self, args, result) # type: ignore[possibly-undefined]
if hook_result is not None:
result = hook_result
except Exception as e:
warnings.warn("global module forward hook with ``always_call=True`` raised an exception "
f"that was silenced as another error was raised in forward: {str(e)}")
continue

for hook_id, hook in self._forward_hooks.items():
if hook_id in self._forward_hooks_always_called and hook_id not in called_always_called_hooks: # type: ignore[possibly-undefined]
try:
if hook_id in self._forward_hooks_with_kwargs:
hook_result = hook(self, args, kwargs, result) # type: ignore[possibly-undefined]
else:
hook_result = hook(self, args, result) # type: ignore[possibly-undefined]
if hook_result is not None:
result = hook_result
except Exception as e:
warnings.warn("module forward hook with ``always_call=True`` raised an exception "
f"that was silenced as another error was raised in forward: {str(e)}")
continue
# raise exception raised in try block
raise
# fmt: on

__call__: Callable[..., Any] = _wrapped_call_impl

def __getstate__(self):
state = self.__dict__.copy()
state.pop("_compiled_call_impl", None)
return state

def __setstate__(self, state):
self.__dict__.update(state)
if '_forward_pre_hooks' not in self.__dict__:

# Support loading old checkpoints that don't have the following attrs:
if "_forward_pre_hooks" not in self.__dict__:
self._forward_pre_hooks = OrderedDict()
if "_forward_pre_hooks_with_kwargs" not in self.__dict__:
self._forward_pre_hooks_with_kwargs = OrderedDict()
if "_forward_hooks_with_kwargs" not in self.__dict__:
self._forward_hooks_with_kwargs = OrderedDict()
if "_forward_hooks_always_called" not in self.__dict__:
self._forward_hooks_always_called = OrderedDict()
if "_state_dict_hooks" not in self.__dict__:
self._state_dict_hooks = OrderedDict()
if "_state_dict_pre_hooks" not in self.__dict__:
self._state_dict_pre_hooks = OrderedDict()
if "_load_state_dict_pre_hooks" not in self.__dict__:
self._load_state_dict_pre_hooks = OrderedDict()
if "_load_state_dict_post_hooks" not in self.__dict__:
self._load_state_dict_post_hooks = OrderedDict()
if "_non_persistent_buffers_set" not in self.__dict__:
self._non_persistent_buffers_set = set()
if "_is_full_backward_hook" not in self.__dict__:
self._is_full_backward_hook = None
if "_backward_pre_hooks" not in self.__dict__:
self._backward_pre_hooks = OrderedDict()

def __getattr__(self, name):
if '_parameters' in self.__dict__:


+ 13
- 8
mindnlp/core/nn/modules/normalization.py View File

@@ -72,18 +72,23 @@ class LayerNorm(Module):
self.normalized_shape = tuple(normalized_shape)
self.eps = eps
self.elementwise_affine = elementwise_affine
self.weight = Parameter(ops.empty(self.normalized_shape, **factory_kwargs), 'weight', elementwise_affine)
if bias:
self.bias = Parameter(ops.empty(self.normalized_shape, **factory_kwargs), 'bias', elementwise_affine)
if self.elementwise_affine:
self.weight = Parameter(ops.empty(self.normalized_shape, **factory_kwargs))
if bias:
self.bias = Parameter(ops.empty(self.normalized_shape, **factory_kwargs))
else:
self.register_parameter('bias', None)
else:
self.register_parameter('weight', None)
self.register_parameter('bias', None)

self.reset_parameters()

def reset_parameters(self) -> None:
init.ones_(self.weight)
if self.bias is not None:
init.zeros_(self.bias)
if self.elementwise_affine:
init.ones_(self.weight)
if self.bias is not None:
init.zeros_(self.bias)

def forward(self, input):
return layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps)
@@ -141,8 +146,8 @@ class GroupNorm(Module):
self.num_channels = num_channels
self.eps = eps
self.affine = affine
self.weight = Parameter(ops.empty(self.normalized_shape, **factory_kwargs), 'weight', affine)
self.bias = Parameter(ops.empty(self.normalized_shape, **factory_kwargs), 'bias', affine)
self.weight = Parameter(ops.empty(num_channels, **factory_kwargs), 'weight', affine)
self.bias = Parameter(ops.empty(num_channels, **factory_kwargs), 'bias', affine)

def forward(self, input):
return group_norm(input, self.num_groups, self.weight, self.bias, self.eps)


+ 3
- 0
mindnlp/core/nn/utils/__init__.py View File

@@ -0,0 +1,3 @@
"""utils"""
from .weight_norm import *
from .clip_grad import *

+ 120
- 0
mindnlp/core/nn/utils/clip_grad.py View File

@@ -0,0 +1,120 @@
"""clip grad"""
# mypy: allow-untyped-defs
import functools
from typing import Union, Iterable, Optional
from typing_extensions import deprecated

import mindspore
from ... import ops
from ...autograd import no_grad

_tensor_or_tensors = Union[mindspore.Tensor, Iterable[mindspore.Tensor]]

__all__ = ['clip_grad_norm_', 'clip_grad_norm', 'clip_grad_value_']

inf = float('inf')

def _no_grad(func):
    """
    Wrap *func* so it always executes inside a ``no_grad()`` context.

    Defined locally (instead of decorating the public functions with
    ``@no_grad`` directly) to avoid a circular import between this module
    and the autograd package.
    """
    @functools.wraps(func)
    def _no_grad_wrapper(*args, **kwargs):
        with no_grad():
            return func(*args, **kwargs)

    return _no_grad_wrapper


@_no_grad
def clip_grad_norm_(
        gradients: _tensor_or_tensors, max_norm: float, norm_type: float = 2.0,
        error_if_nonfinite: bool = False, foreach: Optional[bool] = None) -> mindspore.Tensor:
    r"""Clip the norm of an iterable of gradients in-place.

    The norm is computed over all gradients together, as if they were
    concatenated into a single vector. Gradients are modified in-place.

    Args:
        gradients (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor whose values will be rescaled
        max_norm (float): max norm of the gradients
        norm_type (float): type of the used p-norm. Can be ``inf`` for
            infinity norm.
        error_if_nonfinite (bool): if True, an error is thrown if the total
            norm of the gradients is ``nan``, ``inf``, or ``-inf``.
            Default: False (will switch to True in the future)
        foreach (bool): accepted for API compatibility with torch; the
            foreach fast path is not implemented here and the value is
            ignored.

    Returns:
        Total norm of the gradients (viewed as a single vector).

    Raises:
        RuntimeError: if ``error_if_nonfinite`` is True and the total norm
            is non-finite.
    """
    # Accept a single Tensor or a one-shot iterator (both permitted by the
    # _tensor_or_tensors annotation); materialize into a list so that len()
    # and the two passes below (norm, then scale) both work.
    if isinstance(gradients, mindspore.Tensor):
        grads = [gradients]
    else:
        grads = list(gradients)
    max_norm = float(max_norm)
    norm_type = float(norm_type)
    if len(grads) == 0:
        return mindspore.tensor(0.)
    if norm_type == inf:
        norms = [g.abs().max() for g in grads]
        total_norm = norms[0] if len(norms) == 1 else ops.max(ops.stack(norms))
    else:
        total_norm = ops.norm(ops.stack([ops.norm(g, norm_type) for g in grads]), norm_type)
    if error_if_nonfinite and ops.logical_or(total_norm.isnan(), total_norm.isinf()):
        raise RuntimeError(
            f'The total norm of order {norm_type} for gradients from '
            '`parameters` is non-finite, so it cannot be clipped. To disable '
            'this error and scale the gradients by the non-finite norm anyway, '
            'set `error_if_nonfinite=False`')
    clip_coef = max_norm / (total_norm + 1e-6)
    # Note: multiplying by the clamped coef is redundant when the coef is clamped to 1, but doing so
    # avoids a `if clip_coef < 1:` conditional which can require a CPU <=> device synchronization
    # when the gradients do not reside in CPU memory.
    clip_coef_clamped = ops.clamp(clip_coef, max=1.0)
    for g in grads:
        ops.assign(g, ops.mul(g, clip_coef_clamped))
    return total_norm



@deprecated(
    "`nn.utils.clip_grad_norm` is now deprecated "
    "in favor of `nn.utils.clip_grad_norm_`.",
    category=FutureWarning,
)
def clip_grad_norm(
        parameters: _tensor_or_tensors, max_norm: float, norm_type: float = 2.,
        error_if_nonfinite: bool = False, foreach: Optional[bool] = None) -> mindspore.Tensor:
    r"""Deprecated alias of :func:`clip_grad_norm_`.

    .. warning::
        Use :func:`nn.utils.clip_grad_norm_` instead; this wrapper only
        forwards its arguments and will be removed.
    """
    return clip_grad_norm_(
        parameters,
        max_norm,
        norm_type=norm_type,
        error_if_nonfinite=error_if_nonfinite,
        foreach=foreach,
    )




@_no_grad
def clip_grad_value_(gradients: _tensor_or_tensors, clip_value: float, foreach: Optional[bool] = None) -> None:
    r"""Clip an iterable of gradients in-place at the specified value.

    Each gradient is clamped element-wise into the range
    :math:`\left[\text{-clip\_value}, \text{clip\_value}\right]`.

    Args:
        gradients (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor whose values will be clipped
        clip_value (float): maximum allowed magnitude of the gradients.
        foreach (bool): accepted for API compatibility with torch; the
            foreach fast path is not implemented here and the value is
            ignored.
    """
    # Accept a single Tensor as well as an iterable, matching the
    # _tensor_or_tensors annotation (mirrors clip_grad_norm_).
    if isinstance(gradients, mindspore.Tensor):
        gradients = [gradients]
    clip_value = float(clip_value)
    for grad in gradients:
        ops.assign(grad, ops.clamp(grad, -clip_value, clip_value))

mindnlp/core/nn/utils.py → mindnlp/core/nn/utils/weight_norm.py View File

@@ -15,8 +15,8 @@
r"""Weight Normalization from https://arxiv.org/abs/1602.07868."""
from typing import Any, TypeVar
from mindspore import Parameter
from .modules import Module
from .. import ops
from ..modules import Module
from ... import ops

__all__ = ['WeightNorm', 'weight_norm', 'remove_weight_norm']

@@ -93,6 +93,7 @@ class WeightNorm:
module.register_forward_pre_hook(fn)

return fn

def wrapper_func(self, cell, func):
r"""
wrapper_func where used to transpose cell_id to cell

+ 17
- 0
mindnlp/core/ops/array.py View File

@@ -1,6 +1,7 @@
"""array op"""
import mindspore
from mindspore import ops
from mindspore.ops._primitive_cache import _get_cache_prim

from mindnlp.configs import USE_PYBOOST, GENERATOR_SEED

@@ -59,8 +60,20 @@ def gather_nd(input, indices):
def hstack(tensors):
return ops.hstack(tensors)


# index_fill
def index_fill(input, dim, index, value):
return ops.index_fill(input, dim, index, value)

# index_add
def index_add(input, dim, index, source, *, alpha=1):
if USE_PYBOOST:
return mindspore.ops.auto_generate.gen_ops_prim.index_add_ext_op(input, index, source, dim, alpha)
return ops.index_add(input, index, source, dim)

def inplace_index_add(input, dim, index, source):
_inplace = _get_cache_prim(ops.InplaceIndexAdd)(dim)
return _inplace(input, index, source)

# index_copy

@@ -160,6 +173,10 @@ def scatter_add(input, dim, index, src):
def scatter_nd_update(input, indices, update):
return ops.scatter_nd_update(input, indices, update)


def scatter_update(input, indices, updates):
return ops.scatter_update(input, indices, updates)

# split
def split(tensor, split_size_or_sections, dim=0):
if USE_PYBOOST:


+ 7
- 3
mindnlp/core/ops/creation.py View File

@@ -38,11 +38,13 @@ def zeros(*size, dtype=None):
if USE_PYBOOST:
return mindspore.mint.zeros(size, dtype=dtype)
if dtype is None:
dtype = mindspore.float32
dtype = get_default_dtype()
return _zeros(size, dtype)

# zeros_like
def zeros_like(input, *, dtype=None):
if dtype is None:
dtype = get_default_dtype()
if USE_PYBOOST:
return mindspore.mint.zeros_like(input, dtype=dtype)
return ops.zeros_like(input, dtype=dtype)
@@ -52,14 +54,16 @@ _ones = ops.Ones()
def ones(*size, dtype=None):
if isinstance(size[0], (tuple, list)):
size = size[0]
if dtype is None:
dtype = get_default_dtype()
if USE_PYBOOST:
return mindspore.mint.ones(size, dtype=dtype)
if dtype is None:
dtype = mindspore.float32
return _ones(size, dtype)

# ones_like
def ones_like(input, *, dtype=None):
if dtype is None:
dtype = get_default_dtype()
if USE_PYBOOST:
return mindspore.mint.ones_like(input, dtype=dtype)
return ops.ones_like(input, dtype=dtype)


+ 3
- 0
mindnlp/core/ops/random.py View File

@@ -6,6 +6,7 @@ from mindnlp.configs import USE_PYBOOST, DEVICE_TARGET
from .other import cumsum, searchsorted
from .comparison import topk
from .pointwise import div, log
from ..utils import get_default_dtype

# bernoulli
def bernoulli(input, p=0.5):
@@ -53,6 +54,8 @@ def normal(mean=0.0, std=1.0, size=None):
def rand(*size, dtype=None):
if size[0] == []:
size = ()
if dtype is None:
dtype = get_default_dtype()
if USE_PYBOOST:
return mindspore.mint.rand(*size, dtype=dtype)
return ops.rand(*size, dtype=dtype)


+ 3
- 0
mindnlp/core/optim/__init__.py View File

@@ -1,3 +1,6 @@
"""optimizers"""
from .optimizer import Optimizer
from .sgd import SGD
from .adam import Adam
from .adamw import AdamW
from .lr_scheduler import *

+ 2
- 2
mindnlp/core/optim/adam.py View File

@@ -82,11 +82,11 @@ class Adam(Optimizer):
start = 0
for group in self.param_groups:
end = start + len(group['params'])
amsgrad = group['amsgrad']
maximize = group["maximize"]
for (p, grad) in zip(group['params'], grads[start: end]):
grad = grad if not maximize else -grad
start = end
amsgrad = group['amsgrad']
maximize=group["maximize"]

state = self.state[p]



+ 2
- 2
mindnlp/core/optim/adamw.py View File

@@ -81,11 +81,11 @@ class AdamW(Optimizer):
start = 0
for group in self.param_groups:
end = start + len(group['params'])
amsgrad = group['amsgrad']
maximize = group["maximize"]
for (p, grad) in zip(group['params'], grads[start: end]):
grad = grad if not maximize else -grad
start = end
amsgrad = group['amsgrad']
maximize=group["maximize"]

state = self.state[p]



+ 2166
- 0
mindnlp/core/optim/lr_scheduler.py
File diff suppressed because it is too large
View File


+ 2
- 2
mindnlp/core/optim/sgd.py View File

@@ -60,9 +60,7 @@ class SGD(Optimizer):
for group in self.param_groups:
weight_decay = group['weight_decay']
momentum = Tensor(group['momentum'], mindspore.float32)
stat = Tensor(1, mindspore.float32)
lr = Tensor(group['lr'], mindspore.float32)
accum = Tensor(0, mindspore.float32)
dampening = group['dampening']
nesterov = group['nesterov']
maximize=group["maximize"]
@@ -87,6 +85,8 @@ class SGD(Optimizer):
# d_p = buf
# new_p = p.add(d_p, alpha=-group['lr'])
# assign(p, new_p)
stat = ops.ones_like(p)
accum = ops.zeros_like(p)
ops.optim.raw_sgd(p, d_p, lr, dampening, weight_decay, nesterov, accum, momentum, stat)

return loss

+ 95
- 8
mindnlp/core/serialization.py View File

@@ -32,7 +32,6 @@ from enum import Enum
from typing import Dict, Union, Optional, Any, OrderedDict
from functools import reduce
from dataclasses import dataclass
from ml_dtypes import bfloat16

import numpy as np
import mindspore
@@ -42,10 +41,16 @@ from mindspore.train.serialization import _exec_save, _parse_ckpt_proto, tensor_
import safetensors
import safetensors.numpy

from mindnlp.configs import USE_PYBOOST
from mindnlp.configs import SUPPORT_BF16
from .nn import Module
from ..utils import logging


if SUPPORT_BF16:
from mindspore.common.np_dtype import bfloat16 # pylint: disable=import-error
else:
from ml_dtypes import bfloat16

logger = logging.get_logger(__name__)

MAGIC_NUMBER = 0x1950a86a20f9469cfc6c
@@ -210,6 +215,23 @@ the file.
return self.file.getinfo(filename).header_offset
return None

class PyTorchFileWriter:
    """Minimal zip-backed record writer mimicking torch's PyTorchFileWriter API.

    Records are written straight into a ``zipfile.ZipFile``; each record name
    may be stored at most once.
    """

    def __init__(self, file):
        # ``file`` may be a path or any writable file-like object.
        self.zipfile = zipfile.ZipFile(file, mode='w')
        self.written_records = set()

    def write_record(self, name, data, offset=0):
        """Store ``data`` under ``name``; ``offset`` is accepted but unused."""
        if name not in self.written_records:
            self.written_records.add(name)
            self.zipfile.writestr(name, data)
            return
        raise RuntimeError(f"Record {name} already written")

    def write_end_of_file(self):
        """No-op: ZipFile finalizes its central directory on close."""

    def get_all_written_records(self):
        """Return the set of record names written so far."""
        return self.written_records

class LoadEndianness(Enum):

"""
@@ -695,6 +717,44 @@ class _open_zipfile_reader(_opener):
"""
super().__init__(PyTorchFileReader(name_or_buffer))

class _open_zipfile_writer_file(_opener):
    """Writer ``_opener`` for a filesystem path.

    If the path is pure ASCII it is handed to PyTorchFileWriter directly;
    otherwise a FileIO stream is opened first and wrapped instead
    (presumably to sidestep non-ASCII path handling in the writer — the
    same split torch.serialization makes; confirm if that limitation
    applies here).
    """

    def __init__(self, name):
        self.file_stream = None
        self.name = str(name)
        try:
            # Probe whether the path is ASCII-only.
            self.name.encode('ascii')
        except UnicodeEncodeError:
            # Non-ASCII path: open the file ourselves and pass the stream.
            self.file_stream = io.FileIO(self.name, mode='w')
            super().__init__(PyTorchFileWriter(self.file_stream))
        else:
            super().__init__(PyTorchFileWriter(self.name))

    def __exit__(self, *args):
        # Finalize the archive, then close the stream if we opened one.
        self.file_like.write_end_of_file()
        if self.file_stream is not None:
            self.file_stream.close()

class _open_zipfile_writer_buffer(_opener):
    """Writer ``_opener`` for an in-memory / file-like buffer.

    The buffer must expose a callable ``write``; otherwise AttributeError
    (attribute missing entirely) or TypeError (present but not callable)
    is raised.
    """

    def __init__(self, buffer):
        if not callable(getattr(buffer, "write", None)):
            msg = f"Buffer of {str(type(buffer)).strip('<>')} has no callable attribute 'write'"
            # Distinguish "no write attribute" from "write is not callable".
            if not hasattr(buffer, "write"):
                raise AttributeError(msg)
            raise TypeError(msg)
        self.buffer = buffer
        super().__init__(PyTorchFileWriter(buffer))

    def __exit__(self, *args):
        # Finish the zip archive, then flush the caller-owned buffer.
        self.file_like.write_end_of_file()
        self.buffer.flush()

def _open_zipfile_writer(name_or_buffer):
    """Return the zip-writer wrapper matching the target: file path or buffer."""
    if _is_path(name_or_buffer):
        return _open_zipfile_writer_file(name_or_buffer)
    return _open_zipfile_writer_buffer(name_or_buffer)

def _rebuild_tensor_v2(storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None):
'''Rebuilds a tensor based on the provided parameters.
@@ -719,7 +779,7 @@ def _rebuild_tensor_v2(storage, storage_offset, size, stride, requires_grad, bac
num_elemets = reduce(operator.mul, size)
array = storage[storage_offset: storage_offset + num_elemets]

if array.dtype == bfloat16:
if array.dtype == bfloat16 and not SUPPORT_BF16:
logger.warning_once("MindSpore do not support bfloat16 dtype, we will automaticlly convert to float16")
array = array.astype(np.float16)

@@ -1051,7 +1111,7 @@ def _legacy_load(f, pickle_module, **pickle_load_args):
else:
order = "C"
array = array.reshape(size, order=order)
if array.dtype == bfloat16:
if array.dtype == bfloat16 and not SUPPORT_BF16:
logger.warning_once("MindSpore do not support bfloat16 dtype, we will automaticlly convert to float16")
array = array.astype(np.float16)
new_result[k] = mindspore.Parameter(array, requires_grad=v.requires_grad)
@@ -1205,6 +1265,33 @@ def convert_torch_to_mindspore(pth_file):

return ms_ckpt_path

def _check_save_filelike(f):
if not isinstance(f, (str, os.PathLike)) and not hasattr(f, 'write'):
raise AttributeError(
"expected 'f' to be string, path, or a file-like object with "
"a 'write' attribute")

def save(obj, f, pickle_module = pickle, pickle_protocol = 2):
    """torch.save-style entry point: pickle ``obj`` into a zip archive at ``f``.

    Args:
        obj: object to serialize.
        f: destination — a str/os.PathLike path or a file-like object
           exposing ``write`` (validated by ``_check_save_filelike``).
        pickle_module: module providing ``Pickler`` (default: stdlib pickle).
        pickle_protocol: protocol version passed to the pickler.
    """
    _check_save_filelike(f)
    with _open_zipfile_writer(f) as opened_zipfile:
        _save(obj, opened_zipfile, pickle_module, pickle_protocol)

def _save(obj, zip_file, pickle_module, pickle_protocol):
    """Pickle ``obj`` and write it (plus any collected storages) into ``zip_file``.

    NOTE(review): ``serialized_storages`` is never populated — no
    ``persistent_id`` hook is installed on the pickler — so the storage loop
    below writes nothing and tensor payloads are inlined in the pickle (or
    absent). Confirm whether a persistent-id scheme is intended here.
    """
    serialized_storages = {}

    data_buf = io.BytesIO()
    pickler = pickle_module.Pickler(data_buf, protocol=pickle_protocol)
    pickler.dump(obj)
    data_value = data_buf.getvalue()
    # Third argument fills PyTorchFileWriter's ``offset`` parameter, which the
    # writer ignores; the payload length is passed for API parity with torch.
    zip_file.write_record('archive/data.pkl', data_value, len(data_value))

    for key in sorted(serialized_storages.keys()):
        name = f'archive/data/{key}'
        storage = serialized_storages[key]
        storage_data = storage.inner_data
        zip_file.write_record(name, storage_data)


def safe_load_file(filename):
"""
This function safely loads a file containing state dictionary data and converts it into a dictionary of MindSpore Parameters.
@@ -1225,12 +1312,12 @@ def safe_load_file(filename):
break

state_dict = safetensors.numpy.load_file(filename)
if USE_PYBOOST or dtype != bfloat16:
if (not SUPPORT_BF16 and dtype != bfloat16) or SUPPORT_BF16:
out_states = {k: mindspore.Parameter(v) for k, v in state_dict.items()}
return out_states
else:
out_states = {k: mindspore.Parameter(v.astype(np.float16)) for k, v in state_dict.items()}
return out_states
out_states = {k: mindspore.Parameter(v.astype(np.float16)) for k, v in state_dict.items()}
return out_states


def safe_save_file(tensor_dict, filename, metadata=None):


+ 23
- 23
mindnlp/engine/trainer/base.py View File

@@ -36,12 +36,9 @@ import numpy as np

import mindspore
from mindspore.dataset import Dataset, BatchDataset, PaddedBatchDataset
import mindspore.experimental
import mindspore.experimental.optim
from mindspore.nn.learning_rate_schedule import LearningRateSchedule

from mindnlp.core import nn, ops, optim
from ...core.serialization import safe_load_file, safe_save_file
from ...core.serialization import safe_load_file, safe_save_file, save, save_checkpoint
from ...peft import PeftModel
from ...configs import WEIGHTS_NAME, CONFIG_NAME, ADAPTER_WEIGHTS_NAME, ADAPTER_SAFE_WEIGHTS_NAME, \
WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
@@ -72,7 +69,6 @@ from ..utils import (
get_parameter_names,
get_model_param_count,
speed_metrics,
convert_tensor_to_scalar,
nested_concat,
nested_numpify,
neftune_post_forward_hook,
@@ -139,7 +135,7 @@ class Trainer:
model_init: Optional[Callable[[], PreTrainedModel]] = None,
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
callbacks: Optional[List[TrainerCallback]] = None,
optimizers: Tuple[optim.Optimizer, LearningRateSchedule] = (None, None),
optimizers: Tuple[optim.Optimizer, optim.lr_scheduler.LambdaLR] = (None, None),
preprocess_logits_for_metrics: Optional[Callable[[mindspore.Tensor, mindspore.Tensor], mindspore.Tensor]] = None,
):
"""
@@ -463,13 +459,13 @@ class Trainer:
optimizer_grouped_parameters = [
{
"params": [
p for p in opt_model.trainable_params() if (p.name in decay_parameters and p.requires_grad)
p for p in opt_model.parameters() if (p.name in decay_parameters and p.requires_grad)
],
"weight_decay": self.args.weight_decay,
},
{
"params": [
p for p in opt_model.trainable_params() if (p.name not in decay_parameters and p.requires_grad)
p for p in opt_model.parameters() if (p.name not in decay_parameters and p.requires_grad)
],
"weight_decay": 0.0,
},
@@ -527,12 +523,10 @@ class Trainer:
# optimizer_cls = AdamW
# optimizer_kwargs.update(adam_kwargs)
if args.optim == OptimizerNames.ADAMW:
from mindnlp.core.optim import AdamW

optimizer_cls = AdamW
optimizer_cls = optim.AdamW
optimizer_kwargs.update(adam_kwargs)
elif args.optim == OptimizerNames.SGD:
optimizer_cls = mindspore.experimental.optim.SGD
optimizer_cls = optim.SGD
# TODO: support Adagrad and Rmsporp
# elif args.optim == OptimizerNames.ADAGRAD:
# optimizer_cls = mindspore.nn.Adagrad
@@ -1137,10 +1131,19 @@ MindSpore's `load_checkpoint` function.
# Gradient clipping
if args.max_grad_norm is not None and args.max_grad_norm > 0:
# deepspeed does its own clipping
grads = ops.clip_by_global_norm(grads, args.max_grad_norm)

_grad_norm = nn.utils.clip_grad_norm_(
grads,
args.max_grad_norm,
)
# Optimizer step
self.optimizer(grads)
self.optimizer.step(grads)


optimizer_was_run = True
if optimizer_was_run:
# Delay optimizer scheduling until metrics are generated
if not isinstance(self.lr_scheduler, optim.lr_scheduler.ReduceLROnPlateau):
self.lr_scheduler.step()

self.state.global_step += 1
self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch
@@ -1376,7 +1379,7 @@ indicating whether to prefer safe tensors.
return self.compute_loss(model, inputs)

if getattr(self, 'grad_fn', None) is None or self.model_reload:
self.grad_fn = mindspore.value_and_grad(forward, None, self.optimizer.parameters)
self.grad_fn = mindspore.value_and_grad(forward, None, tuple(model.parameters()))

loss, grads = self.grad_fn(inputs)

@@ -1527,7 +1530,7 @@ indicating whether to prefer safe tensors.
# self._report_to_hp_search(trial, self.state.global_step, metrics)

# Run delayed LR scheduler now that metrics are populated
if isinstance(self.lr_scheduler, mindspore.experimental.optim.lr_scheduler.ReduceLROnPlateau):
if isinstance(self.lr_scheduler, optim.lr_scheduler.ReduceLROnPlateau):
metric_to_check = self.args.metric_for_best_model
if not metric_to_check.startswith("eval_"):
metric_to_check = f"eval_{metric_to_check}"
@@ -1588,7 +1591,7 @@ indicating whether to prefer safe tensors.
state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME), metadata={"format": "np"}
)
else:
mindspore.save_checkpoint(self.model, os.path.join(output_dir, WEIGHTS_NAME))
save_checkpoint(self.model, os.path.join(output_dir, WEIGHTS_NAME))
else:
self.model.save_pretrained(
output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
@@ -1616,15 +1619,12 @@ indicating whether to prefer safe tensors.
"""
if self.args.should_save:
# deepspeed.save_checkpoint above saves model/optim/sched
mindspore.save_checkpoint(self.optimizer, os.path.join(output_dir, OPTIMIZER_NAME))
save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))

# Save SCHEDULER & SCALER
if self.args.should_save:
# with warnings.catch_warnings(record=True) as caught_warnings:
lr_scheduler_state_dict = copy.deepcopy(self.lr_scheduler.state_dict())
with open(os.path.join(output_dir, SCHEDULER_NAME), 'w') as fp:
json.dump(convert_tensor_to_scalar(lr_scheduler_state_dict), fp)
# reissue_pt_warnings(caught_warnings)
save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))

def _save_checkpoint(self, model, metrics=None):
r"""


+ 11
- 10
mindnlp/engine/utils.py View File

@@ -25,11 +25,11 @@ from dataclasses import dataclass
from typing import Union, Tuple, Optional, NamedTuple, List, Dict, Any
from collections.abc import Mapping

import mindspore.experimental
import numpy as np
import mindspore
from mindspore import ops

from mindnlp.core import ops, optim
from mindnlp.core.nn import functional as F
from mindnlp.configs import GENERATOR_SEED
from mindnlp.utils import is_mindspore_available, ExplicitEnum

@@ -265,7 +265,7 @@ class LabelSmoother:
logits = logits[..., :-1, :]
labels = labels[..., 1:]

log_probs = -ops.log_softmax(logits, axis=-1)
log_probs = -F.log_softmax(logits, dim=-1)
if labels.ndim == log_probs.ndim - 1:
labels = labels.unsqueeze(-1)

@@ -273,7 +273,7 @@ class LabelSmoother:
# In case the ignore_index is -100, the gather will fail, so we replace labels by 0. The padding_mask
# will ignore them in any case.
labels = ops.clamp(labels, min=0)
nll_loss = log_probs.gather_elements(dim=-1, index=labels)
nll_loss = ops.gather(log_probs, dim=-1, index=labels)
# works for fp16 input tensor too, by internally upcasting it to fp32
smoothed_loss = ops.sum(log_probs, dim=-1, keepdim=True, dtype=mindspore.float32)

@@ -460,14 +460,14 @@ def get_parameter_names(model, forbidden_layer_types):
Returns the names of the model parameters that are not inside a forbidden layer.
"""
result = []
for name, child in model.name_cells().items():
for name, child in model.named_children():
result += [
f"{name}.{n}"
for n in get_parameter_names(child, forbidden_layer_types)
if not isinstance(child, tuple(forbidden_layer_types))
]
# Add model specific parameters (defined with nn.Parameter) since they are not in any child.
result += list(model._params.keys())
result += list(model._parameters.keys())
return result

def get_model_param_count(model, trainable_only=False):
@@ -518,7 +518,7 @@ def _get_learning_rate(self):
Raises:
None.
"""
if isinstance(self.lr_scheduler, mindspore.experimental.optim.lr_scheduler.ReduceLROnPlateau):
if isinstance(self.lr_scheduler, optim.lr_scheduler.ReduceLROnPlateau):
last_lr = self.optimizer.param_groups[0]["lr"]
else:
last_lr = self.lr_scheduler.get_last_lr()[0]
@@ -526,6 +526,7 @@ def _get_learning_rate(self):
last_lr = last_lr.item()
return last_lr


def find_batch_size(tensors):
"""
Find the first dimension of a tensor in a nested list/tuple/dict of tensors.
@@ -599,8 +600,8 @@ def atleast_1d(tensor_or_array: Union[mindspore.Tensor, np.ndarray]):
"""
if isinstance(tensor_or_array, mindspore.Tensor):
if hasattr(mindspore.ops, "atleast_1d"):
tensor_or_array = ops.atleast_1d(tensor_or_array)
if hasattr(F, "atleast_1d"):
tensor_or_array = F.atleast_1d(tensor_or_array)
elif tensor_or_array.ndim < 1:
tensor_or_array = tensor_or_array[None]
else:
@@ -613,7 +614,7 @@ def ms_pad_and_concatenate(tensor1, tensor2, padding_index=-100):
tensor2 = atleast_1d(tensor2)

if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]:
return ops.cat((tensor1, tensor2), axis=0)
return ops.cat((tensor1, tensor2), dim=0)

# Let's figure out the new shape
new_shape = (tensor1.shape[0] + tensor2.shape[0], max(tensor1.shape[1], tensor2.shape[1])) + tensor1.shape[2:]


+ 5
- 0
mindnlp/peft/config.py View File

@@ -134,6 +134,11 @@ class PeftConfigMixin():
"""
return False

@property
def is_adaption_prompt(self) -> bool:
"""Return True if this is an adaption prompt config."""
return False


@dataclass
class PeftConfig(PeftConfigMixin):


+ 0
- 1
mindnlp/peft/mapping.py View File

@@ -59,7 +59,6 @@ MODEL_TYPE_TO_PEFT_MODEL_MAPPING = {


PEFT_TYPE_TO_CONFIG_MAPPING = {
# "ADAPTION_PROMPT": AdaptionPromptConfig,
"PROMPT_TUNING": PromptTuningConfig,
"PREFIX_TUNING": PrefixTuningConfig,
"P_TUNING": PromptEncoderConfig,


+ 7
- 8
mindnlp/peft/tuners/lora/layer.py View File

@@ -20,7 +20,6 @@ from typing import Any, Optional, Union

import mindspore
from mindspore import Parameter
from mindspore.common.initializer import HeUniform, Normal
from mindnlp.core import nn, ops
from mindnlp.core.nn import ParameterDict, functional as F
from ....transformers.ms_utils import Conv1D
@@ -113,7 +112,7 @@ scaling the layer's parameters, as well as performing mixed batch forward operat

base_layer = self.get_base_layer()
if isinstance(base_layer, nn.Linear):
in_features, out_features = base_layer.in_channels, base_layer.out_channels
in_features, out_features = base_layer.in_features, base_layer.out_features
elif isinstance(base_layer, nn.Conv2d):
in_features, out_features = base_layer.in_channels, base_layer.out_channels
elif isinstance(base_layer, nn.Embedding):
@@ -199,7 +198,7 @@ scaling the layer's parameters, as well as performing mixed batch forward operat
if weight is not None:
# the layer is already completely initialized, this is an update
if ops.is_floating_point(weight) or ops.is_complex(weight):
for param in self.get_parameters():
for param in self.parameters():
param.set_data(param.astype(weight.dtype))
break

@@ -236,16 +235,16 @@ scaling the layer's parameters, as well as performing mixed batch forward operat
if init_lora_weights is True:
# initialize A the same way as the default for nn.Linear and B to zero
# https://github.com/microsoft/LoRA/blob/a0a92e0f26c067cf94747bdbf1ce73793fa44d19/loralib/layers.py#L124
self.lora_A[adapter_name].weight.initialize(HeUniform(math.sqrt(5)))
nn.init.kaiming_uniform_(self.lora_A[adapter_name].weight, a=math.sqrt(5))
elif init_lora_weights.lower() == "gaussian":
self.lora_A[adapter_name].weight.initialize(Normal(1 / self.r[adapter_name]))
nn.init.normal_(self.lora_A[adapter_name].weight, std=1 / self.r[adapter_name])
else:
raise ValueError(f"Unknown initialization {init_lora_weights}")
self.lora_B[adapter_name].weight.initialize('zeros')
nn.init.zeros_(self.lora_B[adapter_name].weight)
if adapter_name in self.lora_embedding_A.keys():
# initialize a the same way as the default for nn.Linear and b to zero
self.lora_embedding_A[adapter_name].initialize('zeros')
self.lora_embedding_B[adapter_name].initialize(Normal(1.0))
nn.init.zeros_(self.lora_embedding_A[adapter_name])
nn.init.normal_(self.lora_embedding_B[adapter_name])

def _get_weight_norm(self, weight, lora_weight, scaling) -> mindspore.Tensor:
r"""


+ 1
- 1
mindnlp/peft/tuners/lora/model.py View File

@@ -130,7 +130,7 @@ class LoraModel(BaseTuner):
... pad_token_id=tokenizer.eos_token_id,
... use_cache=False,
... device_map={"": rank},
... torch_dtype=torch.float16,
... ms_dtype=torch.float16,
... quantization_config=quantization_config,
... )
>>> model = prepare_model_for_kbit_training(model)


+ 2
- 1
mindnlp/transformers/activations.py View File

@@ -244,7 +244,8 @@ class ReLUSquaredActivation(nn.Module):
Raises:
None.
"""
relu_applied = ops.relu(input)
# relu_applied = ops.relu(input)
relu_applied = nn.functional.relu(input)
squared = ops.square(relu_applied)
return squared



+ 29
- 34
mindnlp/transformers/cache_utils.py View File

@@ -219,8 +219,6 @@ class QuantizedCacheConfig(CacheConfig):
Defaults to 128.
compute_dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
The default dtype used for computations in the model. Keys and values will be cast to this dtype after dequantization.
device (`str`, *optional*, defaults to `"cpu"`):
Device on which to perform computations, should be same as the model's device.
"""

def __init__(
@@ -232,7 +230,6 @@ class QuantizedCacheConfig(CacheConfig):
q_group_size: Optional[int] = 64,
residual_length: Optional[int] = 128,
compute_dtype: Optional[mindspore.dtype.TensorType] = mindspore.float16,
device: Optional[str] = "cpu",
):
super().__init__()
self.backend = backend
@@ -242,7 +239,6 @@ class QuantizedCacheConfig(CacheConfig):
self.q_group_size = q_group_size
self.residual_length = residual_length
self.compute_dtype = compute_dtype
self.device = device

def validate(self):
"""Validates if the arguments passed are correct"""
@@ -444,8 +440,8 @@ class DynamicCache(Cache):
def batch_repeat_interleave(self, repeats: int):
"""Repeat the cache `repeats` times in the batch dimension. Used in contrastive search."""
for layer_idx in range(len(self)):
self.key_cache[layer_idx] = self.key_cache[layer_idx].repeat_interleave(repeats, dim=0)
self.value_cache[layer_idx] = self.value_cache[layer_idx].repeat_interleave(repeats, dim=0)
self.key_cache[layer_idx] = ops.repeat_interleave(self.key_cache[layer_idx], repeats, dim=0)
self.value_cache[layer_idx] = ops.repeat_interleave(self.value_cache[layer_idx], repeats, dim=0)

def batch_select_indices(self, indices: mindspore.Tensor):
"""Only keep the `indices` in the batch dimension of the cache. Used in contrastive search."""
@@ -479,7 +475,6 @@ class QuantizedCache(DynamicCache):
self.axis_key = cache_config.axis_key
self.axis_value = cache_config.axis_value
self.compute_dtype = cache_config.compute_dtype
self.device = cache_config.device

super().__init__()

@@ -509,7 +504,7 @@ class QuantizedCache(DynamicCache):
keys_to_return = ops.cat(keys_to_return, dim=-2)
values_to_return = ops.cat(values_to_return, dim=-2)
if (
self.key_cache[layer_idx].dim() == 4
self.key_cache[layer_idx].ndim == 4
and self.key_cache[layer_idx].shape[-2] + 1 >= self.residual_length
):
self._quantized_key_cache[layer_idx] = self._quantize(keys_to_return, axis=self.axis_key)
@@ -656,7 +651,7 @@ class SinkCache(Cache):
if using_rope and layer_idx == 0:
# BC: some models still pass `sin`/`cos` with 2 dims. In those models, they are the full sin/cos. Remove
# after all RoPE models have a llama-like cache utilization.
if cos.dim() == 2:
if cos.ndim == 2:
self._cos_cache = cos
self._sin_cache = sin
else:
@@ -722,8 +717,6 @@ class StaticCache(Cache):
The maximum batch size with which the model will be used.
max_cache_len (`int`):
The maximum sequence length with which the model will be used.
device (`torch.device`):
The device on which the cache should be initialized. Should be the same as the layer.
dtype (*optional*, defaults to `mindspore.float32`):
The default `dtype` to use when initializing the layer.
"""
@@ -788,20 +781,26 @@ class StaticCache(Cache):
v_out = self.value_cache[layer_idx]

if cache_position is None:
k_out.copy_(key_states)
v_out.copy_(value_states)
# k_out.copy_(key_states)
# v_out.copy_(value_states)
k_out.assign_value(key_states)
v_out.assign_value(value_states)
else:
# Note: here we use `tensor.index_copy_(dim, index, tensor)` that is equivalent to
# `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does explicitly an in-place
# operation, that avoids copies and uses less memory.
try:
# If using several devices (e.g.: multiple GPUs), we need to ensure everything is on the same one
k_out = k_out.index_copy(2, cache_position, key_states)
v_out = v_out.index_copy(2, cache_position, value_states)
except NotImplementedError:
# The operator 'aten::index_copy.out' is not currently implemented for the MPS device.
k_out[:, :, cache_position] = key_states
v_out[:, :, cache_position] = value_states
# try:
# # If using several devices (e.g.: multiple GPUs), we need to ensure everything is on the same one
# # k_out = k_out.index_copy(2, cache_position, key_states)
# # v_out = v_out.index_copy(2, cache_position, value_states)
# except NotImplementedError:
# # The operator 'aten::index_copy.out' is not currently implemented for the MPS device.
# k_out[:, :, cache_position] = key_states
# v_out[:, :, cache_position] = value_states

# use index_add for mindspore since tensor slice is too slow and no implementation of index_copy
k_out = ops.index_add(k_out, 2, cache_position.int(), key_states)
v_out = ops.index_add(v_out, 2, cache_position.int(), value_states)

return k_out, v_out

@@ -810,7 +809,7 @@ class StaticCache(Cache):
# Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
# limit the check to the first batch member and head dimension.
# TODO: deprecate this function in favor of `cache_position`
return (self.key_cache[layer_idx][0, 0].any(dim=-1)).sum()
return (ops.any(self.key_cache[layer_idx][0, 0], dim=-1)).sum()

def get_max_length(self) -> Optional[int]:
"""Returns the maximum sequence length of the cached states."""
@@ -820,8 +819,8 @@ class StaticCache(Cache):
"""Resets the cache values while preserving the objects"""
for layer_idx in range(len(self.key_cache)):
# In-place ops prevent breaking the static address
self.key_cache[layer_idx].zero_()
self.value_cache[layer_idx].zero_()
self.key_cache[layer_idx][...] = 0
self.value_cache[layer_idx][...] = 0


class SlidingWindowCache(StaticCache):
@@ -848,8 +847,6 @@ class SlidingWindowCache(StaticCache):
The maximum batch size with which the model will be used.
max_cache_len (`int`):
The maximum sequence length with which the model will be used.
device (`torch.device`):
The device on which the cache should be initialized. Should be the same as the layer.
dtype (*optional*, defaults to `mindspore.float32`):
The default `dtype` to use when initializing the layer.
"""
@@ -920,8 +917,8 @@ class SlidingWindowCache(StaticCache):
def reset(self):
for layer_idx in range(len(self.key_cache)):
# In-place ops prevent breaking the static address
self.key_cache[layer_idx].zero_()
self.value_cache[layer_idx].zero_()
self.key_cache[layer_idx][...] = 0
self.value_cache[layer_idx][...] = 0


class EncoderDecoderCache(Cache):
@@ -1192,8 +1189,8 @@ class HybridCache(Cache):
"""Resets the cache values while preserving the objects"""
for layer_idx in range(len(self.key_cache)):
# In-place ops prevent breaking the static address
self.key_cache[layer_idx].zero_()
self.value_cache[layer_idx].zero_()
self.key_cache[layer_idx][...] = 0
self.value_cache[layer_idx][...] = 0


class MambaCache:
@@ -1204,7 +1201,6 @@ class MambaCache:
config: MambaConfig
max_batch_size: int
dtype: torch.dtype
device: torch.device

Attributes:
dtype: torch.dtype
@@ -1220,7 +1216,6 @@ class MambaCache:
config: PretrainedConfig,
max_batch_size: int,
dtype: mindspore.dtype.TensorType = mindspore.float16,
device: Optional[str] = None,
**kwargs,
):
self.dtype = dtype
@@ -1254,13 +1249,13 @@ class MambaCache:
cache_position = cache_position.clamp(0, self.conv_kernel_size - 1)

conv_state = conv_state.roll(shifts=-1, dims=-1)
conv_state[:, :, cache_position] = new_conv_state.to(conv_state.device)
conv_state[:, :, cache_position] = new_conv_state
self.conv_states[layer_idx].zero_()
self.conv_states[layer_idx] += conv_state
return self.conv_states[layer_idx]

def update_ssm_state(self, layer_idx: int, new_ssm_state: mindspore.Tensor):
self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states.device)
self.ssm_states[layer_idx] = new_ssm_state
return self.ssm_states[layer_idx]

def reset(self):


+ 4
- 1
mindnlp/transformers/configuration_utils.py View File

@@ -603,6 +603,9 @@ class PretrainedConfig:
original_kwargs = copy.deepcopy(kwargs)
# Get config dict associated with the base config file
config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
if 'torch_dtype' in config_dict:
config_dict['ms_dtype'] = config_dict.pop('torch_dtype')

if "_commit_hash" in config_dict:
original_kwargs["_commit_hash"] = config_dict["_commit_hash"]

@@ -631,7 +634,7 @@ class PretrainedConfig:
from_pipeline = kwargs.pop("_from_pipeline", None)
from_auto_class = kwargs.pop("_from_auto", False)
commit_hash = kwargs.pop("_commit_hash", None)
mirror = kwargs.pop('mirror', 'huggingface')
mirror = kwargs.get('mirror', 'huggingface')

gguf_file = kwargs.get("gguf_file", None)



+ 1
- 1
mindnlp/transformers/generation/candidate_generator.py View File

@@ -411,7 +411,7 @@ def _prepare_attention_mask(model_kwargs: Dict[str, Any], new_length: int, is_en
if mask_length_diff < 0:
model_kwargs[mask_key] = mask[:, :mask_length_diff]
elif mask_length_diff > 0:
model_kwargs[mask_key] = ops.cat([mask, mask.new_ones((mask.shape[0], mask_length_diff))], dim=-1)
model_kwargs[mask_key] = ops.cat([mask, ops.ones((mask.shape[0], mask_length_diff), dtype=mask.dtype)], dim=-1)
return model_kwargs




+ 7
- 7
mindnlp/transformers/generation/configuration_utils.py View File

@@ -1032,17 +1032,17 @@ class GenerationConfig:
else:
return config

def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None:
def dict_ms_dtype_to_str(self, d: Dict[str, Any]) -> None:
"""
Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
Checks whether the passed dictionary and its nested dicts have a *ms_dtype* key and if it's not None,
converts the mindspore dtype to a string of just the type. For example, `mindspore.float32` gets converted into *"float32"*
string, which can then be stored in the json format.
"""
if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str):
d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1]
if d.get("ms_dtype", None) is not None and not isinstance(d["ms_dtype"], str):
d["ms_dtype"] = str(d["ms_dtype"]).split(".")[1]
for value in d.values():
if isinstance(value, dict):
self.dict_torch_dtype_to_str(value)
self.dict_ms_dtype_to_str(value)

def to_diff_dict(self) -> Dict[str, Any]:
"""
@@ -1064,7 +1064,7 @@ class GenerationConfig:
if key not in default_config_dict or key == "transformers_version" or value != default_config_dict[key]:
serializable_config_dict[key] = value

self.dict_torch_dtype_to_str(serializable_config_dict)
self.dict_ms_dtype_to_str(serializable_config_dict)
return serializable_config_dict

def to_dict(self) -> Dict[str, Any]:
@@ -1082,7 +1082,7 @@ class GenerationConfig:
if "_original_object_hash" in output:
del output["_original_object_hash"]

self.dict_torch_dtype_to_str(output)
self.dict_ms_dtype_to_str(output)
return output

def to_json_string(self, use_diff: bool = True, ignore_metadata: bool = False) -> str:


+ 17
- 4
mindnlp/transformers/generation/logits_process.py View File

@@ -139,7 +139,7 @@ class MinLengthLogitsProcessor(LogitsProcessor):
eos_token_mask = ops.isin(vocab_tensor, self.eos_token_id)
scores_processed = scores
if input_ids.shape[-1] < self.min_length:
scores_processed = ops.where(eos_token_mask, -math.inf, scores)
scores_processed = ops.where(eos_token_mask, float(ops.finfo(scores.dtype).min), scores)
return scores_processed


@@ -206,7 +206,7 @@ class MinNewTokensLengthLogitsProcessor(LogitsProcessor):
vocab_tensor = ops.arange(scores.shape[-1])
eos_token_mask = ops.isin(vocab_tensor, self.eos_token_id)
if new_tokens_length < self.min_new_tokens:
scores_processed = ops.where(eos_token_mask, -math.inf, scores)
scores_processed = ops.where(eos_token_mask, float(ops.finfo(scores.dtype).min), scores)

return scores_processed

@@ -435,6 +435,8 @@ class TopPLogitsWarper(LogitsWarper):
self.min_tokens_to_keep = min_tokens_to_keep

def __call__(self, input_ids: mindspore.Tensor, scores: mindspore.Tensor) -> mindspore.Tensor:
if self.filter_value == -float("Inf"):
self.filter_value = float(ops.finfo(scores.dtype).min)
sorted_logits, sorted_indices = ops.sort(scores, descending=False)
cumulative_probs = ops.cumsum(ops.softmax(sorted_logits, dim=-1), dim=-1)

@@ -495,6 +497,9 @@ class TopKLogitsWarper(LogitsWarper):


def __call__(self, input_ids: mindspore.Tensor, scores: mindspore.Tensor) -> mindspore.Tensor:
if self.filter_value == -float("Inf"):
self.filter_value = float(ops.finfo(scores.dtype).min)

top_k = min(self.top_k, scores.shape[-1]) # Safety check
# Remove all tokens with a probability less than the last token of the top-k
indices_to_remove = scores < ops.topk(scores, top_k)[0][..., -1, None]
@@ -560,6 +565,8 @@ class MinPLogitsWarper(LogitsWarper):
self.min_tokens_to_keep = min_tokens_to_keep

def __call__(self, input_ids: mindspore.Tensor, scores: mindspore.Tensor) -> mindspore.Tensor:
if self.filter_value == -float("Inf"):
self.filter_value = float(ops.finfo(scores.dtype).min)
# Convert logits to probabilities
probs = ops.softmax(scores, dim=-1)
# Get the probability of the top token for each sequence in the batch
@@ -645,6 +652,8 @@ class TypicalLogitsWarper(LogitsWarper):


def __call__(self, input_ids: mindspore.Tensor, scores: mindspore.Tensor) -> mindspore.Tensor:
if self.filter_value == -float("Inf"):
self.filter_value = float(ops.finfo(scores.dtype).min)
# calculate entropy
normalized = nn.functional.log_softmax(scores, dim=-1)
p = ops.exp(normalized)
@@ -653,7 +662,7 @@ class TypicalLogitsWarper(LogitsWarper):
# shift and sort
shifted_scores = ops.abs((-normalized) - ent)
sorted_scores, sorted_indices = ops.sort(shifted_scores, descending=False)
sorted_logits = scores.gather(-1, sorted_indices)
sorted_logits = ops.gather(scores, -1, sorted_indices)
cumulative_probs = ops.cumsum(ops.softmax(sorted_logits, dim=-1), dim=-1)

# Remove tokens with cumulative mass above the threshold
@@ -724,6 +733,8 @@ class EpsilonLogitsWarper(LogitsWarper):


def __call__(self, input_ids: mindspore.Tensor, scores: mindspore.Tensor) -> mindspore.Tensor:
if self.filter_value == -float("Inf"):
self.filter_value = float(ops.finfo(scores.dtype).min)
# Determine which indices to remove
probabilities = ops.softmax(scores, dim=-1)
indices_to_remove = probabilities < self.epsilon
@@ -805,6 +816,8 @@ class EtaLogitsWarper(LogitsWarper):


def __call__(self, input_ids: mindspore.Tensor, scores: mindspore.Tensor) -> mindspore.Tensor:
if self.filter_value == -float("Inf"):
self.filter_value = float(ops.finfo(scores.dtype).min)
probabilities = ops.softmax(scores, dim=-1)
entropy = ops.distributions.Categorical(logits=scores).entropy()
eta = ops.min(self.epsilon, ops.sqrt(self.epsilon) * ops.exp(-entropy))[..., None]
@@ -1552,7 +1565,7 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
cur_len = input_ids.shape[-1]
scores_processed = scores
if cur_len == self.max_length - 1:
scores_processed = ops.full_like(scores, -math.inf, dtype=scores.dtype)
scores_processed = ops.full_like(scores, float(ops.finfo(scores.dtype).min), dtype=scores.dtype)
scores_processed[:, self.eos_token_id] = 0
return scores_processed



+ 1
- 1
mindnlp/transformers/generation/stopping_criteria.py View File

@@ -478,7 +478,7 @@ class StoppingCriteriaList(list):
def __call__(self, input_ids: mindspore.Tensor, scores: mindspore.Tensor, **kwargs) -> mindspore.Tensor:
is_done = ops.full((input_ids.shape[0],), False, dtype=mindspore.bool_)
for criteria in self:
is_done = is_done | criteria(input_ids, scores, **kwargs)
is_done = (is_done.int() | criteria(input_ids, scores, **kwargs).int()).bool()
return is_done

@property


+ 31
- 8
mindnlp/transformers/generation/utils.py View File

@@ -484,10 +484,10 @@ class GenerationMixin:

is_pad_token_in_inputs = (pad_token_id is not None) and (
ops.isin(elements=inputs, test_elements=pad_token_id).any()
)
).item()
is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~(
ops.isin(elements=eos_token_id, test_elements=pad_token_id).any()
)
).item()
can_infer_attention_mask = is_pad_token_in_inputs & is_pad_token_not_equal_to_eos_token_id
attention_mask_from_padding = inputs.ne(pad_token_id).long()

@@ -665,14 +665,14 @@ class GenerationMixin:
if "attention_mask" in model_kwargs:
attention_mask = model_kwargs["attention_mask"]
model_kwargs["attention_mask"] = ops.cat(
[attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
[attention_mask, ops.ones((attention_mask.shape[0], 1), dtype=attention_mask.dtype)], dim=-1
)
else:
# update decoder attention mask
if "decoder_attention_mask" in model_kwargs:
decoder_attention_mask = model_kwargs["decoder_attention_mask"]
model_kwargs["decoder_attention_mask"] = ops.cat(
[decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))],
[decoder_attention_mask, ops.ones((decoder_attention_mask.shape[0], 1), dtype=decoder_attention_mask.dtype)],
dim=-1,
)

@@ -681,7 +681,7 @@ class GenerationMixin:
else:
past_positions = model_kwargs.pop("cache_position")
new_positions = ops.arange(
past_positions[-1] + 1, past_positions[-1] + num_new_tokens + 1, dtype=past_positions.dtype
past_positions[-1].item() + 1, past_positions[-1].item() + num_new_tokens + 1, dtype=past_positions.dtype
)
model_kwargs["cache_position"] = ops.cat((past_positions, new_positions))
return model_kwargs
@@ -1384,9 +1384,9 @@ class GenerationMixin:
"""Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length"""
# `torch.compile`-friendly `ops.arange` from a shape -- the lines below are equivalent to `ops.arange`
if "inputs_embeds" in model_kwargs:
cache_position = ops.ones_like(model_kwargs["inputs_embeds"][0, :, 0], dtype=mindspore.int64).cumsum(0) - 1
cache_position = ops.cumsum(ops.ones_like(model_kwargs["inputs_embeds"][0, :, 0], dtype=mindspore.int32), 0) - 1
else:
cache_position = ops.ones_like(input_ids[0, :], dtype=mindspore.int64).cumsum(0) - 1
cache_position = ops.cumsum(ops.ones_like(input_ids[0, :], dtype=mindspore.int32), 0) - 1

past_length = 0
if model_kwargs.get("past_key_values") is not None:
@@ -3225,6 +3225,29 @@ class GenerationMixin:
next_token_scores, n_tokens_to_keep, dim=1, largest=True, sorted=True
)

def replace_negative_indices(next_tokens):
next_tokens_np = next_tokens.asnumpy()

used_indices = set(next_tokens_np[next_tokens_np != -1].flatten())
min_unused = 0

result = []
for token_row in next_tokens_np:
new_row = []
for token in token_row:
if token == -1:
while min_unused in used_indices:
min_unused += 1
new_row.append(min_unused)
used_indices.add(min_unused)
else:
new_row.append(token)
result.append(new_row)

return mindspore.Tensor(np.array(result, dtype=next_tokens_np.dtype))

next_tokens = replace_negative_indices(next_tokens)

next_indices = ops.div(next_tokens, vocab_size, rounding_mode="floor")
next_tokens = next_tokens % vocab_size

@@ -3953,7 +3976,7 @@ class GenerationMixin:
candidate_kwargs["cache_position"] = ops.cat(
(
candidate_kwargs["cache_position"],
ops.arange(cur_len, cur_len + candidate_length, dtype=mindspore.int64),
ops.arange(cur_len, cur_len + candidate_length, dtype=candidate_kwargs["cache_position"].dtype),
),
dim=0,
)


+ 540
- 0
mindnlp/transformers/modeling_rope_utils.py View File

@@ -0,0 +1,540 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""rope utils"""
import math
from typing import Optional, Tuple

from .configuration_utils import PretrainedConfig
from ..utils import is_mindspore_available, logging


logger = logging.get_logger(__name__)


if is_mindspore_available():
import mindspore
from mindnlp.core import ops


def _compute_default_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["mindspore.Tensor", float]:
    """
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`mindspore.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    # `config` and the legacy `rope_kwargs` are alternative parameter sources; refuse both at once.
    if config is not None and rope_kwargs:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if rope_kwargs:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
    elif config is not None:
        base = config.rope_theta
        # Some models only rotate a fraction of each attention head's channels.
        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
        dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)

    # The cos/sin post-processing scale is not used by the default RoPE.
    attention_factor = 1.0

    # inv_freq[i] = 1 / base^(2i / dim), computed over the even channel indices.
    inv_freq = 1.0 / (base ** (ops.arange(0, dim, 2, dtype=mindspore.int64).float() / dim))
    return inv_freq, attention_factor


def _compute_linear_scaling_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["mindspore.Tensor", float]:
    """
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`mindspore.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    # `config` and the legacy `rope_kwargs` are alternative parameter sources; refuse both at once.
    if config is not None and rope_kwargs:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if rope_kwargs:
        factor = rope_kwargs["factor"]
    elif config is not None:
        factor = config.rope_scaling["factor"]

    # Start from the unscaled RoPE frequencies.
    inv_freq, attention_factor = _compute_default_rope_parameters(config, seq_len, **rope_kwargs)

    # Linear scaling divides the frequencies rather than multiplying the position ids. Because
    # `embs = inv_freq @ position_ids`, both formulations produce identical embeddings.
    inv_freq /= factor
    return inv_freq, attention_factor


def _compute_dynamic_ntk_parameters(
    config: Optional[PretrainedConfig] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["mindspore.Tensor", float]:
    """
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`mindspore.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
    # `config` and the legacy `rope_kwargs` are alternative parameter sources; refuse both at once.
    if config is not None and rope_kwargs:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if rope_kwargs:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
        max_position_embeddings = rope_kwargs["max_position_embeddings"]
        factor = rope_kwargs["factor"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
        dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
        max_position_embeddings = config.max_position_embeddings
        factor = config.rope_scaling["factor"]

    # The cos/sin post-processing scale is not used by this RoPE variant.
    attention_factor = 1.0

    # Fall back to the pretrained context length when no (longer) runtime length is provided,
    # e.g. at init time.
    if seq_len is None or seq_len <= max_position_embeddings:
        seq_len = max_position_embeddings

    # NTK-aware scaling: grow the base so the lowest frequencies stretch to the requested length.
    scaled_base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
    inv_freq = 1.0 / (scaled_base ** (ops.arange(0, dim, 2, dtype=mindspore.int64).float() / dim))
    return inv_freq, attention_factor


def _compute_yarn_parameters(
    config: PretrainedConfig, seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["mindspore.Tensor", float]:
    """
    Computes the inverse frequencies with YaRN scaling. Please refer to the
    [original paper](https://arxiv.org/abs/2309.00071)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`mindspore.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # No need to keep BC with yarn, unreleased when this new pattern was created.
    if len(rope_kwargs) > 0:
        raise ValueError(
            f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
        )

    base = config.rope_theta
    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
    max_position_embeddings = config.max_position_embeddings
    factor = config.rope_scaling["factor"]

    # Sets the attention factor as suggested in the paper
    attention_factor = config.rope_scaling.get("attention_factor")
    if attention_factor is None:
        attention_factor = 0.1 * math.log(factor) + 1.0

    # Optional config options
    # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
    beta_fast = config.rope_scaling.get("beta_fast") or 32
    beta_slow = config.rope_scaling.get("beta_slow") or 1

    # Compute the inverse frequencies
    def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
        """Inverse dimension formula to find the dimension based on the number of rotations"""
        return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

    def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
        """Find dimension range bounds based on rotations"""
        low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
        high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
        return max(low, 0), min(high, dim - 1)

    def linear_ramp_factor(low_bound, high_bound, dim):
        """Linear ramp rising from 0 at `low_bound` to 1 at `high_bound`, clamped to [0, 1].

        NOTE: parameters were renamed from `min`/`max` to avoid shadowing the builtins.
        """
        if low_bound == high_bound:
            high_bound += 0.001  # Prevent singularity

        linear_func = (ops.arange(dim, dtype=mindspore.float32) - low_bound) / (high_bound - low_bound)
        ramp_func = ops.clamp(linear_func, 0, 1)
        return ramp_func

    # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
    # to expand the possible context length. In other words, interpolation = apply scaling factor.
    pos_freqs = base ** (ops.arange(0, dim, 2).float() / dim)
    inv_freq_extrapolation = 1.0 / pos_freqs
    inv_freq_interpolation = 1.0 / (factor * pos_freqs)

    low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings)

    # Get n-dimensional rotational scaling corrected for extrapolation
    inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float()
    # Blend: interpolated (scaled) frequencies where the ramp is 0, extrapolated (original) where it is 1.
    inv_freq = (
        inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
        + inv_freq_extrapolation * inv_freq_extrapolation_factor
    )

    return inv_freq, attention_factor


def _compute_longrope_parameters(
    config: PretrainedConfig, seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["mindspore.Tensor", float]:
    """
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`mindspore.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
    # No need to keep BC with longrope, unreleased when this new pattern was created.
    if rope_kwargs:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got "
            f"{rope_kwargs}"
        )

    rope_scaling = config.rope_scaling
    base = config.rope_theta
    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
    dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
    long_factor = rope_scaling["long_factor"]
    short_factor = rope_scaling["short_factor"]
    factor = rope_scaling.get("factor")
    attention_factor = rope_scaling.get("attention_factor")

    # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
    # values to compute the default attention scaling factor, instead of using `factor`.
    if hasattr(config, "original_max_position_embeddings"):
        max_position_embeddings = config.original_max_position_embeddings
        expanded_max_position_embeddings = config.max_position_embeddings
        factor = expanded_max_position_embeddings / max_position_embeddings
    else:
        max_position_embeddings = config.max_position_embeddings
        expanded_max_position_embeddings = max_position_embeddings * factor

    # Sets the attention factor as suggested in the paper
    if attention_factor is None:
        if factor <= 1.0:
            attention_factor = 1.0
        else:
            attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings))

    # The long factors apply when the target window exceeds the pretrained one; otherwise the short ones.
    needs_long = expanded_max_position_embeddings > max_position_embeddings
    ext_factors = mindspore.tensor(long_factor if needs_long else short_factor, dtype=mindspore.float32)
    inv_freq_shape = ops.arange(0, dim, 2, dtype=mindspore.int64).float() / dim
    inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)

    return inv_freq, attention_factor


def _compute_llama3_parameters(
    config: PretrainedConfig, seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["mindspore.Tensor", float]:
    """
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`mindspore.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # Gets the default RoPE parameters
    inv_freq, attention_factor = _compute_default_rope_parameters(config, seq_len, **rope_kwargs)

    scaling = config.rope_scaling
    factor = scaling["factor"]  # `8` in the original implementation
    low_freq_factor = scaling["low_freq_factor"]  # `1` in the original implementation
    high_freq_factor = scaling["high_freq_factor"]  # `4` in the original implementation
    old_context_len = scaling["original_max_position_embeddings"]  # `8192` in the original implementation

    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor

    wavelen = 2 * math.pi / inv_freq
    # Three regimes per channel:
    #   wavelen < high_freq_wavelen: keep the original frequency
    #   wavelen > low_freq_wavelen:  divide the frequency by `factor`
    inv_freq_llama = ops.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
    #   in between: interpolate smoothly between the scaled and unscaled frequency
    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
    # Elementwise "not low-frequency and not high-frequency" mask.
    is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
    inv_freq_llama = ops.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)

    return inv_freq_llama, attention_factor


# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
# parameterizations, as long as the callable has the same signature.
# Each callable takes (config, seq_len, **rope_kwargs) and returns (inv_freq, attention_factor).
ROPE_INIT_FUNCTIONS = {
    "default": _compute_default_rope_parameters,
    "linear": _compute_linear_scaling_rope_parameters,
    "dynamic": _compute_dynamic_ntk_parameters,
    "yarn": _compute_yarn_parameters,
    "longrope": _compute_longrope_parameters,
    "llama3": _compute_llama3_parameters,
}


def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None):
"""Compare the received keys in `config.rope_scaling` against the expected and optional keys"""
# BC: "rope_type" was originally "type" -- let's gracefully handle it
if "rope_type" not in received_keys and "type" in received_keys:
received_keys -= {"type"}
received_keys.add("rope_type")

missing_keys = required_keys - received_keys
if missing_keys:
raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}")

if optional_keys is not None:
unused_keys = received_keys - required_keys - optional_keys
else:
unused_keys = received_keys - required_keys
if unused_keys:
logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}")


def _validate_default_rope_parameters(config: PretrainedConfig):
    """Validate `config.rope_scaling` for the default (unscaled) RoPE type."""
    rope_scaling = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))
    _check_received_keys(rope_type, set(rope_scaling.keys()), {"rope_type"})


def _validate_linear_scaling_rope_parameters(config: PretrainedConfig):
    """Validate `config.rope_scaling` for the linear-scaling RoPE type."""
    rope_scaling = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))
    _check_received_keys(rope_type, set(rope_scaling.keys()), {"rope_type", "factor"})

    # `factor` must be a float >= 1; a bad value only warns, it does not abort loading.
    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")


def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig):
    """Validate `config.rope_scaling` for the dynamic NTK RoPE type."""
    rope_scaling = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    _check_received_keys(
        rope_type,
        set(rope_scaling.keys()),
        {"rope_type", "factor"},
        {"original_max_position_embeddings"},
    )

    # `factor` must be a float >= 1; a bad value only warns, it does not abort loading.
    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")


def _validate_yarn_parameters(config: PretrainedConfig):
    """Validate `config.rope_scaling` for the YaRN RoPE type, warning on invalid field values."""
    rope_scaling = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))
    _check_received_keys(
        rope_type,
        set(rope_scaling.keys()),
        {"rope_type", "factor"},
        {"attention_factor", "beta_fast", "beta_slow"},
    )

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    attention_factor = rope_scaling.get("attention_factor")
    if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0):
        logger.warning(
            f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
        )

    beta_fast = rope_scaling.get("beta_fast")
    beta_slow = rope_scaling.get("beta_slow")
    if beta_fast is not None and not isinstance(beta_fast, float):
        logger.warning(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}")
    if beta_slow is not None and not isinstance(beta_slow, float):
        logger.warning(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}")

    # Compare with the paper's defaults (32/1) substituted for missing values.
    if (beta_fast or 32) < (beta_slow or 1):
        logger.warning(
            f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} "
            f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)"
        )


def _validate_longrope_parameters(config: PretrainedConfig):
    """Validate `config.rope_scaling` for the LongRoPE type, warning on invalid field values."""
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "short_factor", "long_factor"}
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, optional_keys)

    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)

    # BUGFIX: the original condition `not isinstance(x, list) and all(...)` never warned for
    # non-list values (and could crash iterating them); the intent is to warn unless the value
    # is a list of numbers. The length check is now guarded behind the type check via `elif`.
    short_factor = rope_scaling.get("short_factor")
    if not (isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor)):
        logger.warning(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}")
    elif not len(short_factor) == dim // 2:
        logger.warning(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}")

    long_factor = rope_scaling.get("long_factor")
    if not (isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor)):
        logger.warning(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}")
    elif not len(long_factor) == dim // 2:
        logger.warning(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}")

    # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over
    # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is
    # unique to longrope (= undesirable)
    if hasattr(config, "original_max_position_embeddings"):
        logger.warning_once(
            "This model has set a `original_max_position_embeddings` field, to be used together with "
            "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`"
            "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, "
            "as it is compatible with most model architectures."
        )
    else:
        factor = rope_scaling.get("factor")
        if factor is None:
            logger.warning("Missing required keys in `rope_scaling`: 'factor'")
        elif not isinstance(factor, float) or factor < 1.0:
            logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

        # BUGFIX: the original expression `a is not None and not isinstance(a, float) or a < 0`
        # binds as `(... and ...) or (a < 0)`, so a None attention_factor reached `None < 0`
        # and raised TypeError. Parenthesize so None short-circuits the whole check.
        attention_factor = rope_scaling.get("attention_factor")
        if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0):
            logger.warning(
                f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
            )


def _validate_llama3_parameters(config: PretrainedConfig):
    """Validate `config.rope_scaling` for the llama3 RoPE type, warning on invalid field values."""
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    low_freq_factor = rope_scaling["low_freq_factor"]
    high_freq_factor = rope_scaling["high_freq_factor"]
    if low_freq_factor is None or not isinstance(low_freq_factor, float):
        logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}")
    if high_freq_factor is None or not isinstance(high_freq_factor, float):
        logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}")
    # BUGFIX: only compare when both values are numeric -- the original comparison raised
    # TypeError on a None value that was meant to produce only the warnings above.
    if (
        isinstance(low_freq_factor, (int, float))
        and isinstance(high_freq_factor, (int, float))
        and high_freq_factor <= low_freq_factor
    ):
        logger.warning(
            "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor="
            f"{high_freq_factor} and low_freq_factor={low_freq_factor}"
        )

    original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]
    if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int):
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be an integer, got "
            f"{original_max_position_embeddings}"
        )
    # BUGFIX: guard the comparison behind the type check (elif) -- `None >= int` raised TypeError.
    elif original_max_position_embeddings >= config.max_position_embeddings:
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got "
            f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}"
        )


# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types.
# Each callable takes a `PretrainedConfig` and warns (or raises KeyError) on invalid `rope_scaling` contents.
ROPE_VALIDATION_FUNCTIONS = {
    "default": _validate_default_rope_parameters,
    "linear": _validate_linear_scaling_rope_parameters,
    "dynamic": _validate_dynamic_scaling_rope_parameters,
    "yarn": _validate_yarn_parameters,
    "longrope": _validate_longrope_parameters,
    "llama3": _validate_llama3_parameters,
}


def rope_config_validation(config: PretrainedConfig):
    """
    Validate the RoPE config arguments, given a `PretrainedConfig` object
    """
    rope_scaling = getattr(config, "rope_scaling", None)  # not a default parameter in `PretrainedConfig`
    if rope_scaling is None:
        return

    # BC: "rope_type" was originally "type"
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default"))
    validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type)
    if validation_fn is None:
        # Unknown rope_type: nothing to run, but surface the gap to the user.
        logger.warning(
            f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'"
        )
    else:
        validation_fn(config)

+ 24
- 9
mindnlp/transformers/modeling_utils.py View File

@@ -570,10 +570,10 @@ def _load_state_dict_into_model(model_to_load, state_dict, start_prefix, assign_
for key in state_dict.keys():
new_key = None
if "gamma" in key:
logger.warning(PARAM_RENAME_WARNING.format("gamma", "weight"))
# logger.warning(PARAM_RENAME_WARNING.format("gamma", "weight"))
new_key = key.replace("gamma", "weight")
if "beta" in key:
logger.warning(PARAM_RENAME_WARNING.format("beta", "bias"))
# logger.warning(PARAM_RENAME_WARNING.format("beta", "bias"))
new_key = key.replace("beta", "bias")
if new_key:
old_keys.append(key)
@@ -817,7 +817,7 @@ class ModuleUtilsMixin:
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * ops.finfo(dtype).min
extended_attention_mask = (1.0 - extended_attention_mask) * float(ops.finfo(dtype).min)
return extended_attention_mask

def get_head_mask(
@@ -1193,7 +1193,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PeftAdapterM
Note `set_default_dtype` currently only works with floating-point types and asserts if for example,
`torch.int64` is passed. So if a non-float `dtype` is passed this functions will throw an exception.
"""
if not isinstance(dtype, typing.Float):
if not isinstance(dtype, (typing.Float, typing.BFloat)):
raise ValueError(
f"Can't instantiate {cls.__name__} model under dtype={dtype} since it is not a floating point dtype"
)
@@ -2431,7 +2431,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PeftAdapterM
resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
output_loading_info = kwargs.pop("output_loading_info", False)
_ = kwargs.pop("mirror", None)
from_pipeline = kwargs.pop("_from_pipeline", None)
from_auto_class = kwargs.pop("_from_auto", False)
_fast_init = kwargs.pop("_fast_init", True)
@@ -2592,18 +2591,33 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PeftAdapterM
elif not use_safetensors and os.path.isfile(
os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant))
):
# Load from a PyTorch checkpoint
# Load from a MindSpore checkpoint
archive_file = os.path.join(
pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant)
)
elif not use_safetensors and os.path.isfile(
os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant))
):
# Load from a sharded PyTorch checkpoint
# Load from a sharded MindSpore checkpoint
archive_file = os.path.join(
pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant)
)
is_sharded = True
elif not use_safetensors and os.path.isfile(
os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(PT_WEIGHTS_NAME, variant))
):
# Load from a PyTorch checkpoint
archive_file = os.path.join(
pretrained_model_name_or_path, subfolder, _add_variant(PT_WEIGHTS_NAME, variant)
)
elif not use_safetensors and os.path.isfile(
os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(PT_WEIGHTS_INDEX_NAME, variant))
):
# Load from a sharded PyTorch checkpoint
archive_file = os.path.join(
pretrained_model_name_or_path, subfolder, _add_variant(PT_WEIGHTS_INDEX_NAME, variant)
)
is_sharded = True
elif use_safetensors:
raise EnvironmentError(
f"Error no file named {_add_variant(SAFE_WEIGHTS_NAME, variant)} found in directory"
@@ -2611,7 +2625,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PeftAdapterM
)
else:
raise EnvironmentError(
f"Error no file named {_add_variant(WEIGHTS_NAME, variant)}, {_add_variant(SAFE_WEIGHTS_NAME, variant)},"
f"Error no file named {_add_variant(WEIGHTS_NAME, variant)}, {_add_variant(PT_WEIGHTS_NAME, variant)}, {_add_variant(SAFE_WEIGHTS_NAME, variant)},"
f" found in directory {pretrained_model_name_or_path}."
)
elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)):
@@ -2858,7 +2872,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PeftAdapterM
dtype_orig = cls._set_default_ms_dtype(ms_dtype)

# Check if `_keep_in_fp32_modules` is not None
use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and (ms_dtype == mindspore.float16)
use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and (ms_dtype == mindspore.float16)

if is_sharded:
loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"]
@@ -2875,6 +2889,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PeftAdapterM
config, use_flash_attention_2=use_flash_attention_2, ms_dtype=ms_dtype, device_map=device_map
)

model_kwargs.pop('mirror', None)
with ContextManagers(init_contexts):
# Let's make sure we don't run the init function of buffer modules
model = cls(config, *model_args, **model_kwargs)


+ 27
- 1
mindnlp/transformers/models/__init__.py View File

@@ -88,10 +88,13 @@ from . import (
chatglm,
chatglm2,
chatglm3,
chatglm4,
flava,
florence2,
focalnet,
fnet,
funnel,
fsmt,
gemma,
git,
gpt,
@@ -108,6 +111,7 @@ from . import (
imagegpt,
instructblip,
ibert,
idefics,
jamba,
jetmoe,
kosmos2,
@@ -164,6 +168,8 @@ from . import (
qdqbert,
qwen2,
qwen2_moe,
rag,
realm,
reformer,
rembert,
resnet,
@@ -183,6 +189,7 @@ from . import (
splinter,
squeezebert,
starcoder2,
superpoint,
swiftformer,
swin,
switch_transformers,
@@ -193,6 +200,7 @@ from . import (
timesformer,
tinybert,
trocr,
tvlt,
udop,
upernet,
umt5,
@@ -223,8 +231,9 @@ from . import (
xmod,
vilt,
yolos,
fuyu,
)
from .fuyu import *
from .owlv2 import *
from .albert import *
from .align import *
@@ -257,6 +266,7 @@ from .bros import *
from .byt5 import *
from .camembert import *
from .canine import *
from .chatglm4 import *
from .clap import *
from .clip import *
from .clipseg import *
@@ -296,9 +306,11 @@ from .chatglm2 import *
from .chatglm3 import *
from .falcon import *
from .flava import *
from .florence2 import *
from .focalnet import *
from .fnet import *
from .funnel import *
from .fsmt import *
from .fastspeech2_conformer import *
from .gemma import *
from .git import *
@@ -313,6 +325,7 @@ from .gpt2 import *
from .graphormer import *
from .groupvit import *
from .ibert import *
from .idefics import *
from .hubert import *
from .imagegpt import *
from .instructblip import *
@@ -372,6 +385,8 @@ from .pop2piano import *
from .qdqbert import *
from .qwen2 import *
from .qwen2_moe import *
from .rag import *
from .realm import *
from .reformer import *
from .rembert import *
from .resnet import *
@@ -391,6 +406,7 @@ from .stablelm import *
from .splinter import *
from .squeezebert import *
from .starcoder2 import *
from .superpoint import *
from .swiftformer import *
from .swin import *
from .switch_transformers import *
@@ -401,6 +417,7 @@ from .tapas import *
from .time_series_transformer import *
from .timesformer import *
from .trocr import *
from .tvlt import *
from .udop import *
from .upernet import *
from .unispeech_sat import *
@@ -501,11 +518,14 @@ __all__.extend(ernie.__all__)
__all__.extend(ernie_m.__all__)
__all__.extend(esm.__all__)
__all__.extend(chatglm.__all__)
__all__.extend(chatglm4.__all__)
__all__.extend(falcon.__all__)
__all__.extend(flava.__all__)
__all__.extend(florence2.__all__)
__all__.extend(fnet.__all__)
__all__.extend(focalnet.__all__)
__all__.extend(funnel.__all__)
__all__.extend(fsmt.__all__)
__all__.extend(fastspeech2_conformer.__all__)
__all__.extend(chatglm2.__all__)
__all__.extend(chatglm3.__all__)
@@ -523,6 +543,7 @@ __all__.extend(graphormer.__all__)
__all__.extend(groupvit.__all__)
__all__.extend(hubert.__all__)
__all__.extend(ibert.__all__)
__all__.extend(idefics.__all__)
__all__.extend(imagegpt.__all__)
__all__.extend(instructblip.__all__)
__all__.extend(jamba.__all__)
@@ -581,6 +602,8 @@ __all__.extend(pop2piano.__all__)
__all__.extend(qdqbert.__all__)
__all__.extend(qwen2.__all__)
__all__.extend(qwen2_moe.__all__)
__all__.extend(rag.__all__)
__all__.extend(realm.__all__)
__all__.extend(reformer.__all__)
__all__.extend(rembert.__all__)
__all__.extend(resnet.__all__)
@@ -605,11 +628,13 @@ __all__.extend(owlv2.__all__)
__all__.extend(swin.__all__)
__all__.extend(switch_transformers.__all__)
__all__.extend(swin2sr.__all__)
__all__.extend(superpoint.__all__)
__all__.extend(t5.__all__)
__all__.extend(time_series_transformer.__all__)
__all__.extend(timesformer.__all__)
__all__.extend(tinybert.__all__)
__all__.extend(trocr.__all__)
__all__.extend(tvlt.__all__)
__all__.extend(udop.__all__)
__all__.extend(upernet.__all__)
__all__.extend(unispeech_sat.__all__)
@@ -639,4 +664,5 @@ __all__.extend(xlm_prophetnet.__all__)
__all__.extend(xlnet.__all__)
__all__.extend(umt5.__all__)
__all__.extend(xmod.__all__)
__all__.extend(fuyu.__all__)
__all__.extend(yolos.__all__)

+ 6
- 6
mindnlp/transformers/models/auto/auto_factory.py View File

@@ -475,10 +475,10 @@ class _BaseAutoModelClass:

if not isinstance(config, PretrainedConfig):
kwargs_orig = copy.deepcopy(kwargs)
# ensure not to pollute the config object with torch_dtype="auto" - since it's
# ensure not to pollute the config object with ms_dtype="auto" - since it's
# meaningless in the context of the config object - torch.dtype values are acceptable
if kwargs.get("torch_dtype", None) == "auto":
_ = kwargs.pop("torch_dtype")
if kwargs.get("ms_dtype", None) == "auto":
_ = kwargs.pop("ms_dtype")
# to not overwrite the quantization_config if config has a quantization_config
if kwargs.get("quantization_config", None) is not None:
_ = kwargs.pop("quantization_config")
@@ -492,9 +492,9 @@ class _BaseAutoModelClass:
**kwargs,
)

# if torch_dtype=auto was passed here, ensure to pass it on
if kwargs_orig.get("torch_dtype", None) == "auto":
kwargs["torch_dtype"] = "auto"
# if ms_dtype=auto was passed here, ensure to pass it on
if kwargs_orig.get("ms_dtype", None) == "auto":
kwargs["ms_dtype"] = "auto"
if kwargs_orig.get("quantization_config", None) is not None:
kwargs["quantization_config"] = kwargs_orig["quantization_config"]



+ 36
- 0
mindnlp/transformers/models/auto/configuration_auto.py View File

@@ -55,6 +55,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("camembert", "CamembertConfig"),
("canine", "CanineConfig"),
("chatglm", "ChatGLMConfig"),
("chatglm4", "ChatGLM4Config"),
("clap", "ClapConfig"),
("clip", "CLIPConfig"),
("clipseg", "CLIPSegConfig"),
@@ -80,6 +81,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("dinov2", "Dinov2Config"),
("distilbert", "DistilBertConfig"),
("donut-swin", "DonutSwinConfig"),
("dpr", "DPRConfig"),
("dpt", "DPTConfig"),
("efficientformer", "EfficientFormerConfig"),
("electra", "ElectraConfig"),
@@ -89,10 +91,13 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("esm", "EsmConfig"),
("falcon", "FalconConfig"),
("flava", "FlavaConfig"),
("florence2", "Florence2Config"),
("fnet", "FNetConfig"),
("focalnet", "FocalNetConfig"),
("funnel", "FunnelConfig"),
("fuyu","FuyuConfig"),
("fastspeech2_conformer", "FastSpeech2ConformerConfig"),
("fsmt","FSMTConfig"),
("gemma", "GemmaConfig"),
("git", "GitConfig"),
("gpt2", "GPT2Config"),
@@ -103,6 +108,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("groupvit", "GroupViTConfig"),
("hubert", "HubertConfig"),
("ibert", "IBertConfig"),
("idefics", "IdeficsConfig"),
("instructblip", "InstructBlipConfig"),
("jamba", "JambaConfig"),
("jetmoe", "JetMoEConfig"),
@@ -144,6 +150,8 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("qdqbert", "QDQBertConfig"),
("qwen2", "Qwen2Config"),
("qwen2_moe", "Qwen2MoeConfig"),
("rag", "RagConfig"),
("realm","RealmConfig"),
("reformer", "ReformerConfig"),
("rembert", "RemBertConfig"),
("resnet", "ResNetConfig"),
@@ -168,6 +176,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("time_series_transformer", "TimeSeriesTransformerConfig"),
("timesformer", "TimesformerConfig"),
("trocr", "TrOCRConfig"),
("tvlt","TvltConfig"),
("udop", "UdopConfig"),
("upernet", "UPerNetConfig"),
("umt5", "UMT5Config"),
@@ -268,6 +277,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
("falcon", "FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("flaubert", "FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("flava", "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("florence2", "FLORENCE2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("fnet", "FNET_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("focalnet", "FOCALNET_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("fsmt", "FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -349,6 +359,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
("pvt", "PVT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("qdqbert", "QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("qwen2", "QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("rag","RAG_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("realm", "REALM_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("regnet", "REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("rembert", "REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -449,6 +460,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("canine", "CANINE"),
("chinese_clip", "Chinese-CLIP"),
("chatglm", "ChatGLM"),
("chatglm4", "ChatGLM4"),
("clap", "CLAP"),
("clip", "CLIP"),
("clip_vision_model", "CLIPVisionModel"),
@@ -503,6 +515,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("flan-ul2", "FLAN-UL2"),
("flaubert", "FlauBERT"),
("flava", "FLAVA"),
("florence2", "Florence2"),
("fnet", "FNet"),
("focalnet", "FocalNet"),
("fsmt", "FairSeq Machine-Translation"),
@@ -1354,6 +1367,9 @@ class AutoConfig:
config_dict, unused_kwargs = PretrainedConfig.get_config_dict(
pretrained_model_name_or_path, **kwargs
)

fix_chatglm_name(config_dict)

if "model_type" in config_dict:
config_class = CONFIG_MAPPING[config_dict["model_type"]]
return config_class.from_dict(config_dict, **unused_kwargs)
@@ -1385,3 +1401,23 @@ class AutoConfig:
"match!"
)
CONFIG_MAPPING.register(model_type, config, exist_ok=exist_ok)


def fix_chatglm_name(config):
    """Normalize legacy ChatGLM config dicts whose ``model_type`` is ``'chatglm'``.

    Official ChatGLM2/ChatGLM3/GLM-4 checkpoints all ship with
    ``model_type='chatglm'``; the actual generation is only encoded in
    ``_name_or_path``. This rewrites ``model_type``, ``architectures`` and
    ``auto_map`` in place so the auto-mapping layer resolves the right classes.

    Args:
        config: raw config dict loaded from ``config.json``; mutated in place.
    """
    if config.get('model_type', None) != 'chatglm':
        return
    # Keys may be absent from hand-written configs; default defensively.
    name_or_path = config.get('_name_or_path', '')
    if 'glm-4' in name_or_path:
        config['model_type'] = 'chatglm4'
        new_model_name = 'ChatGLM4'
    elif 'chatglm3' in name_or_path:
        config['model_type'] = 'chatglm3'
        new_model_name = 'ChatGLM3'
    elif 'chatglm2' in name_or_path:
        config['model_type'] = 'chatglm2'
        new_model_name = 'ChatGLM2'
    else:
        # First-generation ChatGLM (or unrecognized path): nothing to rewrite.
        # (The original code fell through here and crashed with an
        # UnboundLocalError on new_model_name.)
        return

    # str.replace returns a new string; the original code discarded the
    # result for architectures, so the renaming never took effect.
    config['architectures'] = [
        arch.replace('ChatGLM', new_model_name)
        for arch in config.get('architectures', [])
    ]
    auto_map = config.get('auto_map', {})
    for key, value in auto_map.items():
        auto_map[key] = value.replace('ChatGLM', new_model_name)

+ 3
- 3
mindnlp/transformers/models/auto/image_processing_auto.py View File

@@ -66,11 +66,11 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
("efficientnet", "EfficientNetImageProcessor"),
("flava", "FlavaImageProcessor"),
("focalnet", "BitImageProcessor"),
# ("fuyu", "FuyuImageProcessor"),
("fuyu", "FuyuImageProcessor"),
("git", "CLIPImageProcessor"),
# ("glpn", "GLPNImageProcessor"),
("groupvit", "CLIPImageProcessor"),
# ("idefics", "IdeficsImageProcessor"),
("idefics", ("IdeficsImageProcessor",)),
("imagegpt", "ImageGPTImageProcessor"),
# ("instructblip", "BlipImageProcessor"),
("kosmos-2", "CLIPImageProcessor"),
@@ -107,7 +107,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
# ("swinv2", "ViTImageProcessor"),
("table-transformer", "DetrImageProcessor"),
("timesformer", "VideoMAEImageProcessor"),
# ("tvlt", "TvltImageProcessor"),
("tvlt", "TvltImageProcessor"),
# ("tvp", "TvpImageProcessor"),
("udop", "LayoutLMv3ImageProcessor"),
# ("upernet", "SegformerImageProcessor"),


+ 6
- 0
mindnlp/transformers/models/auto/modeling_auto.py View File

@@ -58,6 +58,8 @@ MODEL_MAPPING_NAMES = OrderedDict(
("chameleon", "ChameleonModel"),
("chinese_clip", "ChineseCLIPModel"),
("chinese_clip_vision_model", "ChineseCLIPVisionModel"),
("chatglm", "ChatGLMModel"),
("chatglm4", "ChatGLM4Model"),
("clap", "ClapModel"),
("clip", "CLIPModel"),
("clip_vision_model", "CLIPVisionModel"),
@@ -105,6 +107,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("focalnet", "FocalNetModel"),
("fsmt", "FSMTModel"),
("funnel", ("FunnelModel", "FunnelBaseModel")),
("fsmt","FSMTModel"),
("gemma", "GemmaModel"),
("gemma2", "Gemma2Model"),
("git", "GitModel"),
@@ -446,6 +449,8 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("blenderbot-small", "BlenderbotSmallForCausalLM"),
("bloom", "BloomForCausalLM"),
("camembert", "CamembertForCausalLM"),
('chatglm', 'ChatGLMForConditionalGeneration'),
('chatglm4', 'ChatGLM4ForConditionalGeneration'),
("code_llama", "LlamaForCausalLM"),
("codegen", "CodeGenForCausalLM"),
("cohere", "CohereForCausalLM"),
@@ -456,6 +461,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("electra", "ElectraForCausalLM"),
("ernie", "ErnieForCausalLM"),
("falcon", "FalconForCausalLM"),
("florence2", "Florence2ForConditionalGeneration"),
("fuyu", "FuyuForCausalLM"),
("gemma", "GemmaForCausalLM"),
("gemma2", "Gemma2ForCausalLM"),


+ 1
- 0
mindnlp/transformers/models/auto/processing_auto.py View File

@@ -57,6 +57,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("clvp", "ClvpProcessor"),
("donut", "DonutProcessor"),
("flava", "FlavaProcessor"),
("florence2", "Florence2Processor"),
("fuyu", "FuyuProcessor"),
("git", "GitProcessor"),
("groupvit", "CLIPProcessor"),


+ 2
- 2
mindnlp/transformers/models/auto/tokenization_auto.py View File

@@ -215,7 +215,7 @@ else:
("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
("hubert", ("Wav2Vec2CTCTokenizer", None)),
("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
# ("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
# ("idefics2", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
@@ -570,7 +570,7 @@ def tokenizer_class_from_name(class_name: str):

    # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
# init and we return the proper dummy to get an appropriate error message.
main_module = importlib.import_module("transformers")
main_module = importlib.import_module("mindnlp.transformers")
if hasattr(main_module, class_name):
return getattr(main_module, class_name)



+ 3
- 3
mindnlp/transformers/models/blip_2/modeling_blip_2.py View File

@@ -2372,7 +2372,7 @@ class Blip2Model(Blip2PreTrainedModel):
...
...
>>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
>>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", ms_dtype=torch.float16)
...
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
@@ -2713,7 +2713,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel):
...
>>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
>>> model = Blip2ForConditionalGeneration.from_pretrained(
... "Salesforce/blip2-opt-2.7b", load_in_8bit=True, torch_dtype=torch.float16
... "Salesforce/blip2-opt-2.7b", load_in_8bit=True, ms_dtype=torch.float16
... ) # doctest: +IGNORE_RESULT
...
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
@@ -2748,7 +2748,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel):

```python
>>> model = Blip2ForConditionalGeneration.from_pretrained(
... "Salesforce/blip2-opt-2.7b", load_in_8bit=True, torch_dtype=torch.bfloat16
... "Salesforce/blip2-opt-2.7b", load_in_8bit=True, ms_dtype=torch.bfloat16
... ) # doctest: +IGNORE_RESULT
...
>>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(dtype=torch.bfloat16)


+ 10
- 0
mindnlp/transformers/models/chatglm4/__init__.py View File

@@ -0,0 +1,10 @@
"""chatglm4 model"""
from . import configuration_chatglm4, modeling_chatglm4, tokenization_chatglm4
from .configuration_chatglm4 import *
from .tokenization_chatglm4 import *
from .modeling_chatglm4 import *

__all__ = []
__all__.extend(configuration_chatglm4.__all__)
__all__.extend(tokenization_chatglm4.__all__)
__all__.extend(modeling_chatglm4.__all__)

+ 61
- 0
mindnlp/transformers/models/chatglm4/configuration_chatglm4.py View File

@@ -0,0 +1,61 @@
"""configuration chatglm4"""
from ...configuration_utils import PretrainedConfig


class ChatGLM4Config(PretrainedConfig):
    """Configuration for ChatGLM4 models.

    Stores the architectural hyper-parameters of the GLM transformer stack
    (layer count, hidden sizes, attention layout, normalization and dropout
    settings). Any extra keyword arguments are forwarded to
    ``PretrainedConfig``.
    """

    # NOTE(review): deliberately kept as "chatglm" — shipped GLM-4 checkpoints
    # carry this model_type; the auto-config layer renames it to "chatglm4".
    model_type = "chatglm"

    def __init__(
        self,
        num_layers=28,
        padded_vocab_size=65024,
        hidden_size=4096,
        ffn_hidden_size=13696,
        kv_channels=128,
        num_attention_heads=32,
        seq_length=2048,
        hidden_dropout=0.0,
        classifier_dropout=None,
        attention_dropout=0.0,
        layernorm_epsilon=1e-5,
        rmsnorm=True,
        apply_residual_connection_post_layernorm=False,
        post_layer_norm=True,
        add_bias_linear=False,
        add_qkv_bias=False,
        bias_dropout_fusion=True,
        multi_query_attention=False,
        multi_query_group_num=1,
        rope_ratio=1,
        apply_query_key_layer_scaling=True,
        attention_softmax_in_fp32=True,
        fp32_residual_connection=False,
        **kwargs
    ):
        # ``vocab_size`` mirrors ``padded_vocab_size`` so generic consumers
        # that expect the standard attribute name keep working.
        explicit_options = {
            "num_layers": num_layers,
            "vocab_size": padded_vocab_size,
            "padded_vocab_size": padded_vocab_size,
            "hidden_size": hidden_size,
            "ffn_hidden_size": ffn_hidden_size,
            "kv_channels": kv_channels,
            "num_attention_heads": num_attention_heads,
            "seq_length": seq_length,
            "hidden_dropout": hidden_dropout,
            "classifier_dropout": classifier_dropout,
            "attention_dropout": attention_dropout,
            "layernorm_epsilon": layernorm_epsilon,
            "rmsnorm": rmsnorm,
            "apply_residual_connection_post_layernorm": apply_residual_connection_post_layernorm,
            "post_layer_norm": post_layer_norm,
            "add_bias_linear": add_bias_linear,
            "add_qkv_bias": add_qkv_bias,
            "bias_dropout_fusion": bias_dropout_fusion,
            "multi_query_attention": multi_query_attention,
            "multi_query_group_num": multi_query_group_num,
            "rope_ratio": rope_ratio,
            "apply_query_key_layer_scaling": apply_query_key_layer_scaling,
            "attention_softmax_in_fp32": attention_softmax_in_fp32,
            "fp32_residual_connection": fp32_residual_connection,
        }
        for attr_name, attr_value in explicit_options.items():
            setattr(self, attr_name, attr_value)
        super().__init__(**kwargs)

__all__ = ['ChatGLM4Config']

+ 1017
- 0
mindnlp/transformers/models/chatglm4/modeling_chatglm4.py View File

@@ -0,0 +1,1017 @@
""" MindSpore ChatGLM4 model. """

import math
from typing import Optional, Tuple, Union, List, Dict, Any

import mindspore
from mindnlp.core import nn, ops
import mindnlp.core.nn.functional as F
from mindnlp.core.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
from mindnlp.configs import USE_PYBOOST
from ...modeling_outputs import (
BaseModelOutputWithPast,
CausalLMOutputWithPast,
SequenceClassifierOutputWithPast,
)
from ...modeling_utils import PreTrainedModel
from ....utils import logging
from ...generation.logits_process import LogitsProcessor
from ...generation.utils import ModelOutput

from .configuration_chatglm4 import ChatGLM4Config


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM4"
_CONFIG_FOR_DOC = "ChatGLM4Config"


def default_init(cls, *args, **kwargs):
    """Instantiate *cls* directly, forwarding all positional/keyword args."""
    instance = cls(*args, **kwargs)
    return instance


class InvalidScoreLogitsProcessor(LogitsProcessor):
    """Sanitizes degenerate logits during generation.

    If any score is NaN or Inf, the whole distribution is replaced by one
    that puts essentially all probability mass on token id 198.
    """

    def __call__(self, input_ids: mindspore.Tensor, scores: mindspore.Tensor) -> mindspore.Tensor:
        has_bad_values = ops.isnan(scores).any() or ops.isinf(scores).any()
        if has_bad_values:
            scores = ops.zeros_like(scores, dtype=scores.dtype)
            scores[..., 198] = 5e4
        return scores


def split_tensor_along_last_dim(
    tensor: mindspore.Tensor,
    num_partitions: int,
    contiguous_split_chunks: bool = False,
) -> List[mindspore.Tensor]:
    """Split ``tensor`` into ``num_partitions`` equal chunks along its last axis.

    Arguments:
        tensor: input tensor; its last dimension must be divisible by
            ``num_partitions``.
        num_partitions: number of equally sized chunks to produce.
        contiguous_split_chunks: when True, the chunks are returned as a
            tuple. NOTE(review): despite the name, no ``.contiguous()`` is
            applied to the chunks — confirm whether downstream requires it.

    Returns:
        A list of tensors (a tuple when ``contiguous_split_chunks`` is True).
    """
    axis = tensor.ndim - 1
    chunk_size = tensor.shape[axis] // num_partitions
    chunks = ops.split(tensor, chunk_size, dim=axis)
    if contiguous_split_chunks:
        return tuple(chunks)
    return chunks


class RotaryEmbedding(nn.Module):
    """Rotary position embedding (RoPE) cache generator for ChatGLM4 attention."""

    def __init__(self, dim, rope_ratio=1, original_impl=False, dtype=None):
        super().__init__()
        # Base inverse frequencies. Registered as a buffer, but note that
        # forward_impl recomputes theta from scratch; the buffer is mainly
        # used to carry the dtype (see forward).
        inv_freq = 1.0 / (10000 ** (ops.arange(0, dim, 2).to(dtype=dtype) / dim))
        self.register_buffer("inv_freq", inv_freq)
        self.dim = dim
        self.original_impl = original_impl
        # Scales the RoPE base, effectively stretching the positional period
        # (used for context-length extension).
        self.rope_ratio = rope_ratio

    def forward_impl(
            self, seq_len: int, n_elem: int, dtype: mindspore.dtype, base: int = 10000
    ):
        """Enhanced Transformer with Rotary Position Embedding.

        Returns a cache of (cos, sin) pairs, one per (position, frequency)
        combination, stacked along the last axis.

        Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
        transformers/rope/__init__.py. MIT License:
        https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
        """
        # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
        base = base * self.rope_ratio
        theta = 1.0 / (base ** (ops.arange(0, n_elem, 2, dtype=mindspore.float32) / n_elem))

        # Create position indexes `[0, 1, ..., seq_len - 1]`
        seq_idx = ops.arange(seq_len, dtype=mindspore.float32)

        # Calculate the product of position index and $\theta_i$
        idx_theta = ops.outer(seq_idx, theta).float()

        cache = ops.stack([ops.cos(idx_theta), ops.sin(idx_theta)], dim=-1)

        # this is to mimic the behaviour of complex32, else we will get different results
        if dtype in (mindspore.float16, mindspore.bfloat16, mindspore.int8):
            cache = cache.bfloat16() if dtype == mindspore.bfloat16 else cache.half()
        return cache

    def forward(self, max_seq_len, offset=0):
        # `offset` is accepted for interface compatibility but not used here.
        return self.forward_impl(
            max_seq_len, self.dim, dtype=self.inv_freq.dtype
        )


def apply_rotary_pos_emb(x: mindspore.Tensor, rope_cache: mindspore.Tensor) -> mindspore.Tensor:
    """Apply rotary position embedding to the first `rot_dim` channels of `x`.

    The remaining channels (`x_pass`) are passed through unchanged and
    re-concatenated at the end.
    """
    # x: [b, np, sq, hn]
    b, np, sq, hn = x.shape[0], x.shape[1], x.shape[2], x.shape[3]
    rot_dim = rope_cache.shape[-2] * 2
    # x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
    x, x_pass = ops.split(x, rot_dim, -1)
    # truncate to support variable sizes
    # rope_cache = rope_cache[:, :sq]
    rope_cache = ops.narrow(rope_cache, 1, 0, sq)
    # View the rotated half as interleaved (real, imag) pairs.
    xshaped = x.reshape(b, np, sq, rot_dim // 2, 2)
    xshaped_0, xshaped_1 = ops.split(xshaped, 1, -1)
    rope_cache_0, rope_cache_1 = ops.split(rope_cache, 1, -1)
    # NOTE(review): rope_cache_0/1 are split from the PRE-view tensor and the
    # reshaped rope_cache below is never used afterwards — the upstream
    # reference indexes [..., 0]/[..., 1] AFTER this view. Verify that the
    # broadcast shapes here match the intended behavior.
    rope_cache = rope_cache.view(-1, 1, sq, xshaped.shape[3], 2)
    # Complex rotation: (a+bi)(cos+sin·i) expanded into real arithmetic.
    x_out2 = ops.stack(
        [
            xshaped_0 * rope_cache_0 - xshaped_1 * rope_cache_1,
            xshaped_1 * rope_cache_0 + xshaped_0 * rope_cache_1,
        ],
        -1,
    )
    # Collapse the trailing (pairs, 2) axes back into the channel dimension.
    x_out2 = ops.flatten(x_out2, 3)
    return ops.cat((x_out2, x_pass), dim=-1)


class RMSNorm(nn.Module):
    """Root-mean-square layer normalization (no mean subtraction), with a
    learnable per-channel scale."""

    def __init__(self, normalized_shape, eps=1e-5, dtype=None, **kwargs):
        super().__init__()
        self.weight = nn.Parameter(ops.empty(normalized_shape, dtype=dtype))
        self.eps = eps

    def forward(self, hidden_states: mindspore.Tensor):
        # Fused kernel fast path for inference when pyboost is available.
        if not self.training and USE_PYBOOST:
            return F.rms_norm(hidden_states, self.weight, self.eps)
        orig_dtype = hidden_states.dtype
        # Mean square is computed in float32 for numerical stability.
        mean_square = ops.mean(hidden_states.to(mindspore.float32).pow(2), -1, keepdim=True)
        normalized = hidden_states * ops.rsqrt(mean_square + self.eps)
        return (self.weight * normalized).to(orig_dtype)


class CoreAttention(nn.Module):
    """Scaled dot-product attention core (eager implementation).

    Consumes already-projected query/key/value tensors laid out as
    [b, np, sq, hn] and returns the merged context layer [b, sq, hp].
    """

    def __init__(self, config: ChatGLM4Config, layer_number):
        super(CoreAttention, self).__init__()
        self.config = config
        self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
        # Layer-scaled QK products only make sense with an fp32 softmax.
        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True
        self.layer_number = max(1, layer_number)
        self.is_causal = True

        projection_size = config.kv_channels * config.num_attention_heads

        # Per attention head and per partition values.
        self.hidden_size_per_partition = projection_size
        self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
        self.num_attention_heads_per_partition = config.num_attention_heads

        coeff = None
        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
        # With query-key layer scaling, scores are divided by an extra factor
        # of layer_number here and multiplied back after the fp32 cast in
        # forward() — improves numerical stability in low precision.
        if self.apply_query_key_layer_scaling:
            coeff = self.layer_number
            self.norm_factor *= coeff
        self.coeff = coeff

        self.attention_dropout = nn.Dropout(config.attention_dropout)

    def forward(self, query_layer, key_layer, value_layer, attention_mask):
        # [b, np, sq, sk]
        output_size = (query_layer.shape[0], query_layer.shape[1], query_layer.shape[2], key_layer.shape[2])

        # [b, np, sq, hn] -> [b * np, sq, hn]
        query_layer = query_layer.view(output_size[0] * output_size[1], output_size[2], -1)
        # [b, np, sk, hn] -> [b * np, sk, hn]
        key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1)

        # preallocting input tensor: [b * np, sq, sk]
        matmul_input_buffer = ops.zeros(
            output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
        )

        # Raw attention scores. [b * np, sq, sk]
        # beta=0.0 means the buffer only fixes shape/dtype; contents ignored.
        matmul_result = ops.baddbmm(
            matmul_input_buffer,
            query_layer,  # [b * np, sq, hn]
            ops.transpose(key_layer, 1, 2),  # [b * np, hn, sk]
            beta=0.0,
            alpha=(1.0 / self.norm_factor),
        )

        # change view to [b, np, sq, sk]
        attention_scores = matmul_result.view(*output_size)

        # ===========================
        # Attention probs and dropout
        # ===========================

        # attention scores and attention mask [b, np, sq, sk]
        if self.attention_softmax_in_fp32:
            attention_scores = attention_scores.float()
        if self.coeff is not None:
            attention_scores = attention_scores * self.coeff
        # No explicit mask and a square score matrix (prefill): build a
        # causal mask where True marks positions to be masked out.
        if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
            attention_mask = ops.ones(output_size[0], 1, output_size[2], output_size[3], dtype=mindspore.int32)
            attention_mask = attention_mask.tril().bool()
            attention_mask = ~attention_mask
        if attention_mask is not None:
            attention_scores = attention_scores.masked_fill(attention_mask, float(ops.finfo(attention_scores.dtype).min))
        attention_probs = F.softmax(attention_scores, dim=-1)
        attention_probs = attention_probs.type_as(value_layer)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.attention_dropout(attention_probs)

        # query layer shape: [b * np, sq, hn]
        # value layer shape: [b, np, sk, hn]
        # attention shape: [b, np, sq, sk]
        # context layer shape: [b, np, sq, hn]
        output_size = (value_layer.shape[0], value_layer.shape[1], query_layer.shape[1], value_layer.shape[3])
        # change view [b * np, sk, hn]
        value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.shape[2], -1)
        # change view [b * np, sq, sk]
        attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
        # matmul: [b * np, sq, hn]
        context_layer = ops.bmm(attention_probs, value_layer)
        # change view [b, np, sq, hn]
        context_layer = context_layer.view(*output_size)
        # [b, np, sq, hn] --> [b, sq, np, hn]
        context_layer = ops.transpose(context_layer, 1, 2)
        # [b, sq, np, hn] --> [b, sq, hp]
        new_context_layer_shape = context_layer.shape[:-2] + (self.hidden_size_per_partition,)
        context_layer = context_layer.reshape(*new_context_layer_shape)

        return context_layer



def _get_unpad_data(attention_mask):
    """Collect unpadding metadata from a padding mask.

    Returns a tuple of (flattened indices of non-padding positions,
    cumulative sequence lengths prefixed with 0, longest sequence length).
    """
    per_sample_lengths = attention_mask.sum(dim=-1, dtype=mindspore.int32)
    flat_indices = ops.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    longest = per_sample_lengths.max().item()
    # Prefix the running totals with a leading zero so slot i..i+1 brackets
    # sample i's token range.
    cumulative_lengths = F.pad(ops.cumsum(per_sample_lengths, dim=0, dtype=mindspore.int32), (1, 0))
    return (
        flat_indices,
        cumulative_lengths,
        longest,
    )


CORE_ATTENTION_CLASSES = {
"eager": CoreAttention,
}


class SelfAttention(nn.Module):
    """Parallel self-attention layer abstract class.

    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    (NOTE(review): the comments inside forward() describe a [b, sq, h]
    layout — confirm which convention is actually in effect.)
    """

    def __init__(self, config: ChatGLM4Config, layer_number):
        super(SelfAttention, self).__init__()
        self.layer_number = max(1, layer_number)

        self.projection_size = config.kv_channels * config.num_attention_heads

        # Per attention head and per partition values.
        self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
        self.num_attention_heads_per_partition = config.num_attention_heads

        self.multi_query_attention = config.multi_query_attention
        self.qkv_hidden_size = 3 * self.projection_size
        # With multi-query attention, K and V are shared across groups, so
        # the fused QKV projection only needs 2 * group_num * head_dim for
        # the K/V part.
        if self.multi_query_attention:
            self.num_multi_query_groups_per_partition = config.multi_query_group_num
            self.qkv_hidden_size = (
                    self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
            )
        self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size,
                                         bias=config.add_bias_linear or config.add_qkv_bias,
                                         **_config_to_kwargs(config)
                                         )

        self.core_attention = CORE_ATTENTION_CLASSES[config._attn_implementation](config, self.layer_number)

        # Output.
        self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
                               **_config_to_kwargs(config)
                               )

    def _allocate_memory(self, inference_max_sequence_len, batch_size, dtype=None):
        # Pre-size a KV buffer; with multi-query attention only the shared
        # KV groups need slots.
        if self.multi_query_attention:
            num_attention_heads = self.num_multi_query_groups_per_partition
        else:
            num_attention_heads = self.num_attention_heads_per_partition
        return ops.empty(
            inference_max_sequence_len,
            batch_size,
            num_attention_heads,
            self.hidden_size_per_attention_head,
            dtype=dtype,
        )

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
    ):
        # hidden_states: [b, sq, h]

        # =================================================
        # Pre-allocate memory for key-values for inference.
        # =================================================
        # =====================
        # Query, Key, and Value
        # =====================

        # Attention heads [b, sq, h] --> [b, sq, (np * 3 * hn)]
        mixed_x_layer = self.query_key_value(hidden_states)

        if self.multi_query_attention:
            # Uneven split: Q gets np heads, K and V each get group_num heads.
            (query_layer, key_layer, value_layer) = ops.split(
                mixed_x_layer,
                [
                    self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
                ],
                dim=-1,
            )
            query_layer = query_layer.view(
                query_layer.shape[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
            )
            key_layer = key_layer.view(
                key_layer.shape[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
            )
            value_layer = value_layer.view(
                value_layer.shape[:-1]
                + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
            )
        else:
            new_tensor_shape = mixed_x_layer.shape[:-1] + \
                               (self.num_attention_heads_per_partition,
                                3 * self.hidden_size_per_attention_head)
            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)

            # [b, sq, np, 3 * hn] --> 3 [b, sq, np, hn]
            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)

        # [b, sq, np, hn] -> [b, np, sq, hn]
        query_layer, key_layer, value_layer = [ops.transpose(k, 1, 2) for k in [query_layer, key_layer, value_layer]]

        # apply relative positional encoding (rotary embedding)
        if rotary_pos_emb is not None:
            query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
            key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)

        # adjust key and value for inference
        if kv_cache is not None:
            cache_k, cache_v = kv_cache
            key_layer = ops.cat((cache_k, key_layer), dim=2)
            value_layer = ops.cat((cache_v, value_layer), dim=2)
        if use_cache:
            if kv_cache is None:
                # First decoding step: the cache is emitted as one stacked
                # tensor (extra leading dims) rather than a (k, v) tuple.
                # NOTE(review): downstream must unpack both formats — verify.
                kv_cache = ops.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)),
                                   dim=1)
            else:
                kv_cache = (key_layer, value_layer)
        else:
            kv_cache = None

        # Expand the shared multi-query K/V heads so every query head has a
        # matching K/V head before the attention core runs.
        if self.multi_query_attention:
            key_layer = key_layer.unsqueeze(2)
            key_layer = key_layer.broadcast_to(
                (-1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1)
            )
            key_layer = key_layer.view(
                key_layer.shape[:1] + (self.num_attention_heads_per_partition,) + key_layer.shape[3:]
            )
            value_layer = value_layer.unsqueeze(2)
            value_layer = value_layer.broadcast_to(
                (-1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1)
            )
            value_layer = value_layer.view(
                value_layer.shape[:1] + (self.num_attention_heads_per_partition,) + value_layer.shape[3:]
            )

        # ==================================
        # core attention computation
        # ==================================

        context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)

        # =================
        # Output. [sq, b, h]
        # =================

        output = self.dense(context_layer)

        return output, kv_cache


def _config_to_kwargs(args):
    """Translate config fields into common nn-layer kwargs (currently just dtype)."""
    return {"dtype": args.ms_dtype}


class MLP(nn.Module):
    """MLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
    """

    def __init__(self, config: ChatGLM4Config):
        super().__init__()

        self.add_bias = config.add_bias_linear

        # Up projection. SwiGLU needs both a gate and a value stream, hence
        # the doubled output width (https://arxiv.org/pdf/2002.05202.pdf).
        self.dense_h_to_4h = nn.Linear(
            config.hidden_size,
            config.ffn_hidden_size * 2,
            bias=self.add_bias,
            **_config_to_kwargs(config)
        )

        def swiglu(x):
            gate, value = ops.chunk(x, 2, dim=-1)
            return F.silu(gate) * value

        self.activation_func = swiglu

        # Down projection back to the model's hidden size.
        self.dense_4h_to_h = nn.Linear(
            config.ffn_hidden_size,
            config.hidden_size,
            bias=self.add_bias,
            **_config_to_kwargs(config)
        )

    def forward(self, hidden_states):
        # [s, b, 4hp]
        projected = self.dense_h_to_4h(hidden_states)
        activated = self.activation_func(projected)
        # [s, b, h]
        return self.dense_4h_to_h(activated)


class GLMBlock(nn.Module):
    """A single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size. Layer = (pre-LN -> self-attention -> residual)
    followed by (pre-LN -> MLP -> residual).
    """

    def __init__(self, config: ChatGLM4Config, layer_number):
        super(GLMBlock, self).__init__()
        self.layer_number = layer_number

        # If True, residuals branch off the layer-norm OUTPUT instead of the
        # layer input (see forward).
        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm

        self.fp32_residual_connection = config.fp32_residual_connection

        LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
        # Layernorm on the input data.
        self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon,
                                             dtype=config.ms_dtype)

        # Self attention.
        self.self_attention = SelfAttention(config, layer_number)
        self.hidden_dropout = config.hidden_dropout

        # Layernorm on the attention output
        self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon,
                                                      dtype=config.ms_dtype)

        # MLP
        self.mlp = MLP(config)

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True,
    ):
        # hidden_states: [s, b, h]

        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)
        # Self attention.
        attention_output, kv_cache = self.self_attention(
            layernorm_output,
            attention_mask,
            rotary_pos_emb,
            kv_cache=kv_cache,
            use_cache=use_cache
        )

        # Residual connection.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        # Dropout is applied to the sub-layer output before the residual add.
        layernorm_input = nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
        layernorm_input = residual + layernorm_input

        # Layer norm post the self attention.
        layernorm_output = self.post_attention_layernorm(layernorm_input)

        # MLP.
        mlp_output = self.mlp(layernorm_output)

        # Second residual connection.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = layernorm_input

        output = nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training)
        output = residual + output

        return output, kv_cache


class GLMTransformer(nn.Module):
    """Transformer backbone: a stack of ``num_layers`` GLMBlocks plus an
    optional final layer norm.

    ``forward`` returns ``(hidden_states, presents, all_hidden_states,
    all_self_attentions)``; attention probabilities are never collected, so the
    last element is always ``None``.
    """

    def __init__(self, config: ChatGLM4Config):
        super(GLMTransformer, self).__init__()

        self.fp32_residual_connection = config.fp32_residual_connection
        self.post_layer_norm = config.post_layer_norm

        # Number of layers.
        self.num_layers = config.num_layers

        # Transformer layers (GLM uses 1-based layer numbering).
        def build_layer(layer_number):
            return GLMBlock(config, layer_number)

        self.layers = nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])

        if self.post_layer_norm:
            LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
            # Final layer norm before output.
            self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon,
                                                 dtype=config.ms_dtype)

        self.gradient_checkpointing = False

    def _get_layer(self, layer_number):
        # 0-based index into the layer stack.
        return self.layers[layer_number]

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None,
            use_cache: Optional[bool] = True,
            output_hidden_states: Optional[bool] = False,
    ):
        """Run every layer over ``hidden_states``.

        ``presents`` is a tuple of per-layer caches during token-by-token
        decoding, or a single stacked tensor during prefill (saves memory).
        """
        if kv_caches is None:
            kv_caches = [None for _ in range(self.num_layers)]
        presents = () if use_cache else None
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
            use_cache = False

        all_self_attentions = None  # attention probs are never collected here
        all_hidden_states = () if output_hidden_states else None
        for index in range(self.num_layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer = self._get_layer(index)
            # BUGFIX: the original gradient-checkpointing branch was a bare
            # `pass` (the torch `checkpoint.checkpoint` call is commented out),
            # which left `layer_ret` unbound and raised NameError on the unpack
            # below. Activation checkpointing is not implemented in this port,
            # so always run a regular forward pass: identical computation, just
            # without the memory savings.
            layer_ret = layer(
                hidden_states,
                attention_mask,
                rotary_pos_emb,
                kv_cache=kv_caches[index],
                use_cache=use_cache
            )
            hidden_states, kv_cache = layer_ret
            if use_cache:
                # token by token decoding, use tuple format
                if kv_caches[0] is not None:
                    presents = presents + (kv_cache,)
                # prefilling in decoding, use tensor format to save cuda memory
                else:
                    if len(presents) == 0:
                        presents = kv_cache
                    else:
                        presents = ops.cat((presents, kv_cache), dim=0)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # Final layer norm.
        if self.post_layer_norm:
            hidden_states = self.final_layernorm(hidden_states)

        return hidden_states, presents, all_hidden_states, all_self_attentions


class ChatGLM4PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    """

    is_parallelizable = False
    supports_gradient_checkpointing = True
    config_class = ChatGLM4Config
    # Checkpoint weight names are expected under the "transformer." prefix.
    base_model_prefix = "transformer"
    _no_split_modules = ["GLMBlock"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module: nn.Module):
        """Initialize the weights."""
        # Intentionally a no-op: weights come from pretrained checkpoints.
        return

    def get_masks(self, input_ids, past_key_values, padding_mask=None):
        """Build the boolean attention mask for the encoder.

        Returns an inverted mask of shape [b, 1, s, s(+past)] where True marks
        positions that must NOT be attended to. For flash-attention-2 the raw
        2-D padding mask (or None) is returned instead.
        """
        if self.config._attn_implementation == "flash_attention_2":
            # FA2 consumes the padding mask directly; None means "no masking".
            if padding_mask is not None and not padding_mask.all():
                return padding_mask
            return None
        batch_size, seq_length = input_ids.shape
        # Lower-triangular causal mask over the current sequence.
        full_attention_mask = ops.ones(batch_size, seq_length, seq_length)
        full_attention_mask = full_attention_mask.tril()
        past_length = 0
        if past_key_values:
            past_length = past_key_values[0][0].shape[2]
        if past_length:
            # Cached (past) key positions are always visible to every query.
            full_attention_mask = ops.cat((ops.ones(batch_size, seq_length, past_length), full_attention_mask), dim=-1)
        if padding_mask is not None:
            # Zero out attention to padded key positions.
            full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
        if not past_length and padding_mask is not None:
            # Raise scores on padded query rows above the 0.5 threshold so they
            # are not fully masked (assumes 1 marks real tokens — TODO confirm).
            full_attention_mask -= padding_mask.unsqueeze(-1) - 1
        # Invert (True == masked out) and insert the broadcast head dimension.
        full_attention_mask = (full_attention_mask < 0.5).bool()
        full_attention_mask.unsqueeze_(1)
        return full_attention_mask

    def get_position_ids(self, input_ids):
        """Return sequential position ids [0..seq_len) broadcast to the batch."""
        batch_size, seq_length = input_ids.shape
        position_ids = ops.arange(seq_length, dtype=mindspore.int64).unsqueeze(0).repeat(batch_size, 1)
        return position_ids

class Embedding(nn.Module):
    """Token-embedding front end of the GLM-4 model."""

    def __init__(self, config: ChatGLM4Config):
        super(Embedding, self).__init__()
        self.hidden_size = config.hidden_size
        # Vocabulary lookup table (padded vocab size from the config).
        self.word_embeddings = nn.Embedding(config.padded_vocab_size, self.hidden_size,
                                            dtype=config.ms_dtype)
        self.fp32_residual_connection = config.fp32_residual_connection

    def forward(self, input_ids):
        """Look up token embeddings, optionally promoting them to float32."""
        embeddings = self.word_embeddings(input_ids)
        # When fp32 residual connections are requested, activations must enter
        # the transformer stack in float32.
        if self.fp32_residual_connection:
            embeddings = embeddings.float()
        return embeddings


class ChatGLM4Model(ChatGLM4PreTrainedModel):
    """Bare GLM-4 transformer: embeddings + rotary cache + encoder + LM head
    weights (``output_layer`` lives here; the CLM wrapper reuses it)."""

    def __init__(self, config: ChatGLM4Config, empty_init=True):
        super().__init__(config)
        # `empty_init` is kept for interface compatibility with upstream; this
        # port always materialises weights via `default_init`.
        init_method = default_init
        init_kwargs = {}
        self.embedding = init_method(Embedding, config, **init_kwargs)
        self.num_layers = config.num_layers
        self.multi_query_group_num = config.multi_query_group_num
        self.kv_channels = config.kv_channels

        # Rotary positional embeddings
        self.seq_length = config.seq_length
        rotary_dim = (
            config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
        )

        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio,
                                              original_impl=config.original_rope,
                                              dtype=config.ms_dtype)
        self.encoder = init_method(GLMTransformer, config, **init_kwargs)
        self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
                                        dtype=config.ms_dtype, **init_kwargs)

    def get_input_embeddings(self):
        return self.embedding.word_embeddings

    def set_input_embeddings(self, value):
        self.embedding.word_embeddings = value

    def forward(
            self,
            input_ids,
            position_ids: Optional[mindspore.Tensor] = None,
            attention_mask: Optional[mindspore.Tensor] = None,
            full_attention_mask: Optional[mindspore.Tensor] = None,
            past_key_values: Optional[Tuple[Tuple[mindspore.Tensor, mindspore.Tensor], ...]] = None,
            inputs_embeds: Optional[mindspore.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ):
        """Encode ``input_ids`` and return hidden states plus the KV cache.

        `output_attentions` is accepted for interface compatibility but the
        encoder never produces attention maps (it returns None for them).
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        seq_length = input_ids.shape[1]

        if inputs_embeds is None:
            inputs_embeds = self.embedding(input_ids)

        if full_attention_mask is None:
            # A full 4-D mask is only needed when there is padding or when a
            # multi-token prompt is combined with an existing cache.
            if (attention_mask is not None and not attention_mask.all()) or (past_key_values is not None and seq_length != 1):
                full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)

        # Rotary positional embeddings: precompute for the full model length,
        # then gather the slice the current step actually needs.
        rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
        if position_ids is not None:
            rotary_pos_emb = rotary_pos_emb[position_ids]
        else:
            rotary_pos_emb = rotary_pos_emb[None, :seq_length]

        # Run encoder.
        hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
            inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
            kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
        )
        # During prefill the encoder packs the cache into a single tensor;
        # unpack it back into the per-layer (key, value) tuple-of-tuples format.
        # (idiom fix: `isinstance` instead of `type(...) is`.)
        if presents is not None and isinstance(presents, mindspore.Tensor):
            presents = ops.split(presents, 1, dim=0)
            presents = list(presents)
            presents = [list(ops.split(x.squeeze(0), 1, dim=0)) for x in presents]
            presents = [tuple(x.squeeze(0) for x in y) for y in presents]
            presents = tuple(presents)

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class ChatGLM4ForConditionalGeneration(ChatGLM4PreTrainedModel):
    """GLM-4 causal-LM wrapper: reuses the base model's `output_layer` as the
    LM head and adds generation bookkeeping plus the shifted-label CE loss."""

    def __init__(self, config: ChatGLM4Config, empty_init=True):
        super().__init__(config)

        self.max_sequence_length = config.max_length
        self.transformer = ChatGLM4Model(config, empty_init=empty_init)
        self.config = config

    def _update_model_kwargs_for_generation(
        self,
        outputs: ModelOutput,
        model_kwargs: Dict[str, Any],
        is_encoder_decoder: bool = False,
        standardize_cache_format: bool = False,
    ) -> Dict[str, Any]:
        """Advance generation state by one step: carry the cache forward,
        extend the attention mask with a 1 and append the next position id.

        `is_encoder_decoder` / `standardize_cache_format` are accepted for
        interface compatibility with the HF generation mixin but unused here.
        """
        # update past_key_values
        cache_name, cache = self._extract_past_from_model_output(outputs)
        model_kwargs[cache_name] = cache

        # update attention mask
        if "attention_mask" in model_kwargs:
            attention_mask = model_kwargs["attention_mask"]
            model_kwargs["attention_mask"] = ops.cat(
                [attention_mask, ops.ones((attention_mask.shape[0], 1), dtype=attention_mask.dtype)], dim=-1
            )

        # update position ids
        if "position_ids" in model_kwargs:
            position_ids = model_kwargs["position_ids"]
            new_position_id = position_ids[..., -1:]
            new_position_id += 1
            model_kwargs["position_ids"] = ops.cat(
                [position_ids, new_position_id], dim=-1
            )

        # Subsequent steps take the incremental-decoding path below.
        model_kwargs["is_first_forward"] = False
        return model_kwargs

    def prepare_inputs_for_generation(
            self,
            input_ids: mindspore.Tensor,
            past_key_values: Optional[mindspore.Tensor] = None,
            attention_mask: Optional[mindspore.Tensor] = None,
            position_ids: Optional[mindspore.Tensor] = None,
            use_cache: Optional[bool] = None,
            is_first_forward: bool = True,
            **kwargs
    ) -> dict:
        """Assemble the kwargs for one decoding step; after the first forward,
        only the last token (and its position id) is fed when a cache exists."""
        # only last token for input_ids if past is not None
        if position_ids is None:
            position_ids = self.get_position_ids(input_ids)
        if not is_first_forward:
            if past_key_values is not None:
                position_ids = position_ids[..., -1:]
                input_ids = input_ids[:, -1:]
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "return_last_logit": True,
            "use_cache": use_cache
        }

    def forward(
            self,
            input_ids: Optional[mindspore.Tensor] = None,
            position_ids: Optional[mindspore.Tensor] = None,
            attention_mask: Optional[mindspore.Tensor] = None,
            past_key_values: Optional[Tuple[mindspore.Tensor]] = None,
            inputs_embeds: Optional[mindspore.Tensor] = None,
            labels: Optional[mindspore.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            return_last_logit: Optional[bool] = False,
    ):
        """Run the transformer and project to vocabulary logits.

        When `labels` is given, computes the standard next-token CE loss
        (positions with label -100 are ignored). `return_last_logit=True`
        projects only the final position — used during generation.
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids=input_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        if return_last_logit:
            # Only the last position is needed to sample the next token.
            hidden_states = hidden_states[:, -1:]
        lm_logits = self.transformer.output_layer(hidden_states)

        loss = None
        if labels is not None:
            # Loss is computed in float32 for numerical stability.
            lm_logits = lm_logits.to(mindspore.float32)

            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            # Flatten the tokens
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1))

            # Cast back to the model's working dtype before returning.
            lm_logits = lm_logits.to(hidden_states.dtype)
            loss = loss.to(hidden_states.dtype)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
            past: Tuple[Tuple[mindspore.Tensor, mindspore.Tensor], ...], beam_idx: mindspore.Tensor
    ) -> Tuple[Tuple[mindspore.Tensor, mindspore.Tensor], ...]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.

        Output shares the same memory storage as `past`.
        """
        return tuple(
            (
                layer_past[0].index_select(0, beam_idx),
                layer_past[1].index_select(0, beam_idx),
            )
            for layer_past in past
        )


class ChatGLM4ForSequenceClassification(ChatGLM4PreTrainedModel):
    """GLM-4 with a linear classification head on the last token's hidden
    state; supports regression, single-label and multi-label objectives."""

    def __init__(self, config: ChatGLM4Config, empty_init=True):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.transformer = ChatGLM4Model(config, empty_init=empty_init)

        self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=config.ms_dtype)
        if config.classifier_dropout is not None:
            self.dropout = nn.Dropout(config.classifier_dropout)
        else:
            self.dropout = None
        self.config = config

    def forward(
            self,
            input_ids: Optional[mindspore.Tensor] = None,
            position_ids: Optional[mindspore.Tensor] = None,
            attention_mask: Optional[mindspore.Tensor] = None,
            full_attention_mask: Optional[mindspore.Tensor] = None,
            past_key_values: Optional[Tuple[Tuple[mindspore.Tensor, mindspore.Tensor], ...]] = None,
            inputs_embeds: Optional[mindspore.Tensor] = None,
            labels: Optional[mindspore.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor, ...], SequenceClassifierOutputWithPast]:
        """Classify a sequence from its final-position hidden state.

        The problem type (regression / single-label / multi-label) is inferred
        once from `num_labels` and the label dtype, then cached on the config.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids=input_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            full_attention_mask=full_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        # Pool by taking the last position (left-padded inputs expected —
        # the tokenizer pads on the left; TODO confirm for raw callers).
        pooled_hidden_states = hidden_states[:, -1]
        if self.dropout is not None:
            pooled_hidden_states = self.dropout(pooled_hidden_states)
        logits = self.classifier_head(pooled_hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                # Integer labels with >1 classes imply single-label CE;
                # float labels imply multi-label BCE.
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (mindspore.int64, mindspore.int32):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze().float(), labels.squeeze())
                else:
                    loss = loss_fct(logits.float(), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits.float(), labels.view(-1, self.num_labels))

        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

# Public symbols exported by this module.
__all__ = [
    "ChatGLM4Model",
    "ChatGLM4ForSequenceClassification",
    "ChatGLM4ForConditionalGeneration",
    "ChatGLM4PreTrainedModel"
]

+ 233
- 0
mindnlp/transformers/models/chatglm4/tokenization_chatglm4.py View File

@@ -0,0 +1,233 @@
"""tokenization chatglm4"""
import os
import base64
from typing import List, Optional, Union, Dict

import regex as re
from ...tokenization_utils import PreTrainedTokenizer
from ....utils import PaddingStrategy, is_tiktoken_available
from ...tokenization_utils_base import EncodedInput, BatchEncoding

if is_tiktoken_available():
import tiktoken

class ChatGLM4Tokenizer(PreTrainedTokenizer):
    """BPE tokenizer for GLM-4 backed by ``tiktoken``.

    The vocabulary file stores one ``<base64-encoded token> <rank>`` pair per
    line; tokens are raw byte strings, so decoding must reassemble multi-byte
    UTF-8 characters that were split across tokens.
    """

    vocab_files_names = {"vocab_file": "tokenizer.model"}
    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(
            self,
            vocab_file,
            padding_side="left",
            clean_up_tokenization_spaces=False,
            encode_special_tokens=False,
            **kwargs
    ):
        self.name = "GLM4Tokenizer"
        self.vocab_file = vocab_file
        pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
        self.pat_str = re.compile(pat_str)
        self.encode_special_tokens = encode_special_tokens

        # Parse the vocab: one "<base64 token> <rank>" pair per line.
        mergeable_ranks = {}
        with open(vocab_file) as f:
            for line in f:
                token, rank = line.strip().split()
                rank = int(rank)
                token = base64.b64decode(token)
                mergeable_ranks[token] = rank

        self.mergeable_ranks = mergeable_ranks

        self.tokenizer = tiktoken.Encoding(
            name="my_tokenizer",
            pat_str=pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens={}
        )
        # id -> raw byte token, for decoding.
        self.decoder = {rank: token for token, rank in mergeable_ranks.items()}
        self.n_words = len(self.decoder)

        super().__init__(
            padding_side=padding_side,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs
        )

    @property
    def vocab_size(self):
        return self.n_words

    def get_vocab(self):
        """ Returns vocab as a dict """
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def convert_tokens_to_string(self, tokens: List[Union[bytes, str, int]]) -> str:
        """
        Converts a sequence of tokens in a single string.

        Byte tokens are accumulated in `temp` and decoded as UTF-8 in one go,
        so multi-byte characters split across tokens decode correctly.
        """
        text = ""
        temp = b""
        for t in tokens:
            if isinstance(t, int):
                t = chr(t)
            if isinstance(t, str):
                if temp:
                    text += temp.decode("utf-8", errors="replace")
                    # BUGFIX: flush the byte buffer after decoding it; the
                    # original never reset `temp`, re-decoding the same bytes
                    # on every subsequent string token.
                    temp = b""
                # BUGFIX: append the string token itself — the original
                # silently dropped every str/int token from the output.
                text += t
            elif isinstance(t, bytes):
                temp += t
            else:
                raise TypeError("token should only be of type int, bytes or str")
        if temp:
            text += temp.decode("utf-8", errors="replace")
        return text

    def _tokenize(self, text, **kwargs):
        # Encode to ids with tiktoken, then map each id back to its raw byte
        # token so downstream id lookup goes through `mergeable_ranks`.
        tokens = []
        ids = self.tokenizer.encode(text)
        for t in ids:
            tokens.append(self.decoder[t])
        return tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.mergeable_ranks[token]

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, "")

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, self.vocab_files_names["vocab_file"]
            )
        else:
            vocab_file = save_directory

        # Copy the original vocab file byte-for-byte.
        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_str)

        return (vocab_file,)

    def get_prefix_tokens(self):
        # Every GLM prompt starts with [gMASK]<sop>.
        prefix_tokens = [self.convert_tokens_to_ids("[gMASK]"), self.convert_tokens_to_ids("<sop>")]
        return prefix_tokens

    def build_single_message(self, role, metadata, message, tokenize=True):
        """Render one chat turn as `<|role|>metadata\\n message`, either as
        token ids (`tokenize=True`) or as a plain string."""
        assert role in ["system", "user", "assistant", "observation"], role
        if tokenize:
            role_tokens = [self.convert_tokens_to_ids(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n",
                                                                                             disallowed_special=())
            message_tokens = self.tokenizer.encode(message, disallowed_special=())
            tokens = role_tokens + message_tokens
            return tokens
        else:
            return str(f"<|{role}|>{metadata}\n{message}")

    def build_inputs_with_special_tokens(
            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        prefix_tokens = self.get_prefix_tokens()
        token_ids_0 = prefix_tokens + token_ids_0
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1 + [self.convert_tokens_to_ids("<eos>")]
        return token_ids_0

    def _pad(
            self,
            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            pad_to_multiple_of: Optional[int] = None,
            return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                - 'left': pads on the left of the sequences
                - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults; this tokenizer only supports left padding.
        assert self.padding_side == "left"

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        # Initialize attention mask if not present.
        if "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * seq_length

        if "position_ids" not in encoded_inputs:
            encoded_inputs["position_ids"] = list(range(seq_length))

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            # Left padding: zeros are prepended to the mask / position ids and
            # pad tokens to the ids.
            if "attention_mask" in encoded_inputs:
                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
            if "position_ids" in encoded_inputs:
                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input

        return encoded_inputs

# Public symbols exported by this module.
__all__ = ['ChatGLM4Tokenizer']

+ 29
- 0
mindnlp/transformers/models/florence2/__init__.py View File

@@ -0,0 +1,29 @@
# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
MindNLP Florence Model.
"""

from . import modeling_florence2, configuration_florence2, processing_florence2

from .modeling_florence2 import *
from .configuration_florence2 import *
from .processing_florence2 import *


# Re-export the public API declared by each submodule.
__all__ = []
__all__.extend(modeling_florence2.__all__)
__all__.extend(configuration_florence2.__all__)
__all__.extend(processing_florence2.__all__)

+ 333
- 0
mindnlp/transformers/models/florence2/configuration_florence2.py View File

@@ -0,0 +1,333 @@
# coding=utf-8
# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Florence-2 configuration"""
from mindnlp.transformers.configuration_utils import PretrainedConfig
from mindnlp.utils import logging

logger = logging.get_logger(__name__)


class Florence2VisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Florence2VisionModel architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        drop_path_rate (`float`, *optional*, defaults to 0.1):
            The dropout rate of the drop path layer.
        patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
            The patch size of the image.
        patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
            The patch stride of the image.
        patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
            The patch padding of the image.
        patch_prenorm (`List[bool]`, *optional*, defaults to [false, true, true, true]):
            Whether to apply layer normalization before the patch embedding layer.
        enable_checkpoint (`bool`, *optional*, defaults to False):
            Whether to enable checkpointing.
        dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
            The dimension of the embedding layer.
        num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
            The number of attention heads.
        num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
            The number of groups.
        depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
            The depth of the model.
        window_size (`int`, *optional*, defaults to 12):
            The window size of the model.
        projection_dim (`int`, *optional*, defaults to 1024):
            The dimension of the projection layer.
        visual_temporal_embedding (`dict`, *optional*):
            The configuration of the visual temporal embedding.
        image_pos_embed (`dict`, *optional*):
            The configuration of the image position embedding.
        image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
            The source of the image feature.
    Example:

    ```python
    >>> from mindnlp.transformers.models.florence2 import Florence2VisionConfig, Florence2VisionModel

    >>> # Initializing a Florence2 Vision style configuration
    >>> configuration = Florence2VisionConfig()

    >>> # Initializing a model (with random weights)
    >>> model = Florence2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2_vision"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        drop_path_rate=0.1,
        patch_size=None,
        patch_stride=None,
        patch_padding=None,
        patch_prenorm=None,
        enable_checkpoint=False,
        dim_embed=None,
        num_heads=None,
        num_groups=None,
        depths=None,
        window_size=12,
        projection_dim=1024,
        visual_temporal_embedding=None,
        image_pos_embed=None,
        image_feature_source=None,
        **kwargs,
    ):
        # BUGFIX: the list defaults were mutable default arguments, so every
        # default-constructed config shared (and could corrupt) the same list
        # objects. They are now None sentinels resolved here, one fresh list
        # per instance; the effective default values are unchanged.
        self.drop_path_rate = drop_path_rate
        self.patch_size = [7, 3, 3, 3] if patch_size is None else patch_size
        self.patch_stride = [4, 2, 2, 2] if patch_stride is None else patch_stride
        self.patch_padding = [3, 1, 1, 1] if patch_padding is None else patch_padding
        self.patch_prenorm = [False, True, True, True] if patch_prenorm is None else patch_prenorm
        self.enable_checkpoint = enable_checkpoint
        self.dim_embed = [256, 512, 1024, 2048] if dim_embed is None else dim_embed
        self.num_heads = [8, 16, 32, 64] if num_heads is None else num_heads
        self.num_groups = [8, 16, 32, 64] if num_groups is None else num_groups
        self.depths = [1, 1, 9, 1] if depths is None else depths
        self.window_size = window_size
        self.projection_dim = projection_dim
        self.visual_temporal_embedding = visual_temporal_embedding
        self.image_pos_embed = image_pos_embed
        self.image_feature_source = (
            ["spatial_avg_pool", "temporal_avg_pool"] if image_feature_source is None else image_feature_source
        )

        super().__init__(**kwargs)


class Florence2LanguageConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the BART
[facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
vocab_size (`int`, *optional*, defaults to 51289):
Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Florence2LanguageModel`].
d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for classifier.
max_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
scale_embedding (`bool`, *optional*, defaults to `False`):
Scale embeddings by diving by sqrt(d_model).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
num_labels (`int`, *optional*, defaults to 3):
The number of labels to use in [`Florence2LanguageForSequenceClassification`].
forced_eos_token_id (`int`, *optional*, defaults to 2):
The id of the token to force as the last generated token when `max_length` is reached. Usually set to
`eos_token_id`.

Example:

```python
>>> from mindnlp.transformers.models.florence2 import Florence2LanguageConfig, Florence2LanguageModel

>>> # Initializing a Florence2 Language style configuration
>>> configuration = Florence2LanguageConfig()

>>> # Initializing a model (with random weights)
>>> model = Florence2LangaugeModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""

model_type = "florence2_language"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=51289,
        max_position_embeddings=1024,
        encoder_layers=12,
        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
        decoder_layers=12,
        decoder_ffn_dim=4096,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=1024,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        num_labels=3,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        is_encoder_decoder=True,
        decoder_start_token_id=2,
        forced_eos_token_id=2,
        **kwargs,
    ):
        """Initialize a Florence-2 language (BART-style encoder-decoder) configuration.

        Each argument is documented in the class docstring. Remaining keyword
        arguments are forwarded to ``PretrainedConfig.__init__``.
        """
        # Model size / architecture hyper-parameters.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        # Regularization settings.
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
        # Mirror the encoder depth for PretrainedConfig consumers that expect
        # the generic `num_hidden_layers` attribute.
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        # Token ids, label count and generation behavior are stored by the base class.
        super().__init__(
            num_labels=num_labels,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
            **kwargs,
        )


class Florence2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an
    Florence-2 model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Custom vision config dict; wrapped into a generic `PretrainedConfig`.
        text_config (`dict`, *optional*):
            The config dict of the text backbone; wrapped into a `Florence2LanguageConfig`.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        vocab_size (`int`, *optional*, defaults to 51289):
            Vocabulary size of the Florence2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~Florence2ForConditionalGeneration`]
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimension of the multimodal projection space.

    Example:

    ```python
    >>> from mindnlp.transformers.models.florence2 import Florence2ForConditionalGeneration, Florence2Config
    >>> from mindnlp.transformers.models.clip import CLIPVisionConfig

    >>> # Initializing a Florence-2 configuration
    >>> configuration = Florence2Config()

    >>> # Initializing a model from the florence-2 configuration
    >>> model = Florence2ForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        vocab_size=51289,
        projection_dim=1024,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.vocab_size = vocab_size
        self.projection_dim = projection_dim

        # NOTE(review): assumes `vision_config` arrives as a dict (e.g. loaded
        # from JSON); a ready-made config object would fail here — confirm callers.
        if vision_config is not None:
            vision_config = PretrainedConfig(**vision_config)
        # Assign unconditionally so the attribute always exists (it is None
        # when no vision config was given). Also removed the no-op
        # `self.vocab_size = self.vocab_size` self-assignment.
        self.vision_config = vision_config

        self.text_config = text_config
        if text_config is not None:
            self.text_config = Florence2LanguageConfig(**text_config)

        super().__init__(**kwargs)


# Public API of this configuration module.
__all__ = [
    "Florence2Config",
    "Florence2VisionConfig",
    "Florence2LanguageConfig",
]

+ 2561
- 0
mindnlp/transformers/models/florence2/modeling_florence2.py
File diff suppressed because it is too large
View File


+ 1098
- 0
mindnlp/transformers/models/florence2/processing_florence2.py View File

@@ -0,0 +1,1098 @@
# coding=utf-8
# Copyright 2024 Microsoft and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for Florence-2.
"""

import re
import logging
from typing import List, Optional, Union
import numpy as np

import mindspore
from mindspore import Tensor

from mindnlp.core import ops
from mindnlp.transformers.feature_extraction_utils import BatchFeature
from mindnlp.transformers.image_utils import ImageInput, is_valid_image
from mindnlp.transformers.processing_utils import ProcessorMixin
from mindnlp.transformers.tokenization_utils_base import (
PaddingStrategy,
PreTokenizedInput,
TextInput,
TruncationStrategy,
)
from mindnlp.transformers.models.bart import BartTokenizer, BartTokenizerFast
from mindnlp.transformers.models.t5 import T5Tokenizer, T5TokenizerFast
from mindnlp.utils import TensorType


logger = logging.getLogger(__name__)


# Copied from transformers.models.idefics2.processing_idefics2.is_url
# Copied from transformers.models.idefics2.processing_idefics2.is_url
def is_url(val) -> bool:
    """Return True when *val* is a string beginning with "http"."""
    if not isinstance(val, str):
        return False
    return val.startswith("http")


# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
def is_image_or_image_url(elem):
    """Return True when *elem* is an http(s) URL string or a valid image object."""
    if is_url(elem):
        return True
    return is_valid_image(elem)


def _is_str_or_image(elem):
    """Return True for plain strings as well as images / image URLs."""
    if isinstance(elem, str):
        return True
    return is_image_or_image_url(elem)


class Florence2Processor(ProcessorMixin):
    r"""
    Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor.

    [`Florence2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BartTokenizerFast`]. See the
    [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.

    Args:
        image_processor ([`CLIPImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`BartTokenizerFast`], *optional*):
            The tokenizer is a required input.
    """

    # ProcessorMixin wiring: the managed sub-components and their backing classes.
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("BartTokenizer", "BartTokenizerFast")

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
    ):
        # Despite the `None` defaults (required by the ProcessorMixin signature
        # convention), both components are mandatory.
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        if not hasattr(image_processor, "image_seq_length"):
            raise ValueError("Image processor is missing an `image_seq_length` attribute.")

        # Number of sequence positions the image occupies; __call__ subtracts
        # this from `max_length` so text + image fit the model's budget.
        self.image_seq_length = image_processor.image_seq_length

        # Register Florence-2 task/region special tokens, including the 1000
        # quantized location tokens <loc_0>..<loc_999>.
        # NOTE(review): '<region_to_desciption>' is misspelled, but the literal
        # token string must match released checkpoints — do not "fix" it.
        tokens_to_add = {
            'additional_special_tokens': \
                tokenizer.additional_special_tokens + \
                ['<od>', '</od>', '<ocr>', '</ocr>'] + \
                [f'<loc_{x}>' for x in range(1000)] + \
                ['<cap>', '</cap>', '<ncap>', '</ncap>', '<dcap>', '</dcap>', '<grounding>', '</grounding>',
                 '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>',
                 '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
        }
        tokenizer.add_special_tokens(tokens_to_add)

        # Maps each task token to the post-processing strategy applied to the
        # generated text in `post_process_generation`.
        self.tasks_answer_post_processing_type = {
            '<OCR>': 'pure_text',
            '<OCR_WITH_REGION>': 'ocr',
            '<CAPTION>': 'pure_text',
            '<DETAILED_CAPTION>': 'pure_text',
            '<MORE_DETAILED_CAPTION>': 'pure_text',
            '<OD>': 'description_with_bboxes',
            '<DENSE_REGION_CAPTION>': 'description_with_bboxes',
            '<CAPTION_TO_PHRASE_GROUNDING>': "phrase_grounding",
            '<REFERRING_EXPRESSION_SEGMENTATION>': 'polygons',
            '<REGION_TO_SEGMENTATION>': 'polygons',
            '<OPEN_VOCABULARY_DETECTION>': 'description_with_bboxes_or_polygons',
            '<REGION_TO_CATEGORY>': 'pure_text',
            '<REGION_TO_DESCRIPTION>': 'pure_text',
            '<REGION_TO_OCR>': 'pure_text',
            '<REGION_PROPOSAL>': 'bboxes'
        }

        # Task tokens that expand to a fixed natural-language prompt.
        self.task_prompts_without_inputs = {
            '<OCR>': 'What is the text in the image?',
            '<OCR_WITH_REGION>': 'What is the text in the image, with regions?',
            '<CAPTION>': 'What does the image describe?',
            '<DETAILED_CAPTION>': 'Describe in detail what is shown in the image.',
            '<MORE_DETAILED_CAPTION>': 'Describe with a paragraph what is shown in the image.',
            '<OD>': 'Locate the objects with category name in the image.',
            '<DENSE_REGION_CAPTION>': 'Locate the objects in the image, with their descriptions.',
            '<REGION_PROPOSAL>': 'Locate the region proposals in the image.'
        }

        # Task tokens whose prompt templates interpolate the rest of the text
        # via `{input}`.
        self.task_prompts_with_input = {
            '<CAPTION_TO_PHRASE_GROUNDING>': "Locate the phrases in the caption: {input}",
            '<REFERRING_EXPRESSION_SEGMENTATION>': 'Locate {input} in the image with mask',
            '<REGION_TO_SEGMENTATION>': 'What is the polygon mask of region {input}',
            '<OPEN_VOCABULARY_DETECTION>': 'Locate {input} in the image.',
            '<REGION_TO_CATEGORY>': 'What is the region {input}?',
            '<REGION_TO_DESCRIPTION>': 'What does the region {input} describe?',
            '<REGION_TO_OCR>': 'What text is in the region {input}?',
        }

        self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)
        super().__init__(image_processor, tokenizer)

    def _construct_prompts(self, text):
        """Expand task tokens in each element of *text* into full prompts.

        A text that equals a no-input task token is replaced by its fixed
        prompt; a text containing an input-style task token has the token
        stripped and the remainder substituted into the prompt template.
        """
        # replace the task tokens with the task prompts if task token is in the text
        prompts = []
        for _text in text:
            # 1. fixed task prompts without additional inputs
            for task_token, task_prompt in self.task_prompts_without_inputs.items():
                if task_token in _text:
                    assert _text == task_token, f"Task token {task_token} should be the only token in the text."
                    _text = task_prompt
                    break
            # 2. task prompts with additional inputs
            for task_token, task_prompt in self.task_prompts_with_input.items():
                if task_token in _text:
                    _text = task_prompt.format(input=_text.replace(task_token, ''))
                    break
            prompts.append(_text)
        return prompts

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        images: ImageInput = None,
        tokenize_newline_separately: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length=None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.MINDSPORE,
        do_resize: bool = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        data_format: Optional["ChannelDimension"] = "channels_first",  # noqa: F821
        input_data_format: Optional[
            Union[str, "ChannelDimension"]  # noqa: F821
        ] = None,
        resample: "PILImageResampling" = None,  # noqa: F821
        do_convert_rgb: bool = None,
        do_thumbnail: bool = None,
        do_align_long_axis: bool = None,
        do_rescale: bool = None,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            tokenize_newline_separately (`bool`, defaults to `True`):
                Adds a separately tokenized '\n' at the end of the prompt.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`, *optional*):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
              is provided, the `input_ids` will also contain the suffix input ids.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **labels** -- Labels compatible with training if `suffix` is not None
        """

        # Hard-coded off: the labels branch below is therefore never taken.
        # NOTE(review): presumably kept for future suffix/label support — confirm.
        return_token_type_ids = False

        if images is None:
            raise ValueError("`images` are expected as arguments to a `Florence2Processor` instance.")
        if text is None:
            logger.warning(
                "You are using Florence-2 without a text prompt."
            )
            text = ""

        if isinstance(text, List) and isinstance(images, List):
            if len(images) < len(text):
                raise ValueError(
                    f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image."
                )
        # Normalize a single prompt (or single image-like input) to a batch of one.
        if _is_str_or_image(text):
            text = [text]
        elif isinstance(text, list) and _is_str_or_image(text[0]):
            pass

        pixel_values = self.image_processor(
            images,
            do_resize=do_resize,
            do_normalize=do_normalize,
            return_tensors=return_tensors,
            image_mean=image_mean,
            image_std=image_std,
            input_data_format=input_data_format,
            data_format=data_format,
            resample=resample,
            do_convert_rgb=do_convert_rgb,
        )["pixel_values"]

        if max_length is not None:
            max_length -= self.image_seq_length  # max_length has to account for the image tokens

        # Expand task tokens (e.g. '<OD>') into their natural-language prompts.
        text = self._construct_prompts(text)

        inputs = self.tokenizer(
            text,
            return_tensors=return_tensors,
            padding=padding,
            max_length=max_length,
            truncation=truncation,
            return_token_type_ids=return_token_type_ids,
        )

        return_data = {**inputs, "pixel_values": pixel_values}

        # Dead branch while return_token_type_ids is hard-coded False above.
        if return_token_type_ids:
            labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
            return_data.update({"labels": labels})
        return BatchFeature(data=return_data)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
    def model_input_names(self):
        # Union of tokenizer and image-processor input names, order-preserving
        # and de-duplicated via dict.fromkeys.
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    def post_process_generation(self, text, task, image_size):
        """
        Post-process the output of the model to each of the task outputs.

        Args:
            text (`str`): The text to post-process.
            task (`str`): The task to post-process the text for.
            image_size (`Tuple[int, int]`): The size of the image. height x width.

        Returns:
            dict: ``{task: parsed_result}`` where the result shape depends on
            the task's post-processing type (plain string, bboxes+labels,
            quad boxes, polygons, or a mixed bbox/polygon dict).
        """

        # Unknown tasks fall back to plain-text handling.
        task_answer_post_processing_type = self.tasks_answer_post_processing_type.get(task, 'pure_text')
        task_answer = self.post_processor(
            text=text,
            image_size=image_size,
            parse_tasks=task_answer_post_processing_type,
        )[task_answer_post_processing_type]

        if task_answer_post_processing_type == 'pure_text':
            final_answer = task_answer
            # remove the special tokens
            final_answer = final_answer.replace('<s>', '').replace('</s>', '')
        elif task_answer_post_processing_type in ['od', 'description_with_bboxes', 'bboxes']:
            od_instances = task_answer
            bboxes_od = [_od_instance['bbox'] for _od_instance in od_instances]
            labels_od = [str(_od_instance['cat_name']) for _od_instance in od_instances]
            final_answer = {'bboxes': bboxes_od, 'labels': labels_od}
        elif task_answer_post_processing_type in ['ocr']:
            bboxes = [_od_instance['quad_box'] for _od_instance in task_answer]
            labels = [str(_od_instance['text']) for _od_instance in task_answer]
            final_answer = {'quad_boxes': bboxes, 'labels': labels}
        elif task_answer_post_processing_type in ['phrase_grounding']:
            # Each grounded phrase may carry several boxes; flatten them and
            # repeat the phrase label per box.
            bboxes = []
            labels = []
            for _grounded_phrase in task_answer:
                for _bbox in _grounded_phrase['bbox']:
                    bboxes.append(_bbox)
                    labels.append(_grounded_phrase['cat_name'])
            final_answer = {'bboxes': bboxes, 'labels': labels}
        elif task_answer_post_processing_type in ['description_with_polygons', 'polygons']:
            labels = []
            polygons = []
            for result in task_answer:
                label = result['cat_name']
                _polygons = result['polygons']
                labels.append(label)
                polygons.append(_polygons)
            final_answer = {'polygons': polygons, 'labels': labels}
        elif task_answer_post_processing_type in ['description_with_bboxes_or_polygons']:
            # Split mixed results into bbox-shaped and polygon-shaped outputs.
            bboxes = []
            bboxes_labels = []
            polygons = []
            polygons_labels = []
            for result in task_answer:
                label = result['cat_name']
                if 'polygons' in result:
                    _polygons = result['polygons']
                    polygons.append(_polygons)
                    polygons_labels.append(label)
                else:
                    _bbox = result['bbox']
                    bboxes.append(_bbox)
                    bboxes_labels.append(label)
            final_answer = {'bboxes': bboxes, 'bboxes_labels': bboxes_labels, 'polygons': polygons, 'polygons_labels': polygons_labels}
        else:
            raise ValueError('Unknown task answer post processing type: {}'.format(task_answer_post_processing_type))

        final_answer = {
            task: final_answer}
        return final_answer


class BoxQuantizer:
    """Maps bounding-box coordinates to/from a fixed grid of discrete bins.

    Args:
        mode: Quantization strategy; only ``'floor'`` is implemented.
        bins: ``(width_bins, height_bins)`` grid resolution.
    """

    def __init__(self, mode, bins):
        self.mode = mode
        self.bins = bins

    def quantize(self, boxes: Tensor, size):
        """Convert continuous ``(xmin, ymin, xmax, ymax)`` boxes into integer bin indices."""
        n_bins_w, n_bins_h = self.bins    # Quantization bins.
        img_w, img_h = size               # Original image size.
        per_bin_w = img_w / n_bins_w
        per_bin_h = img_h / n_bins_h
        xmin, ymin, xmax, ymax = boxes.split(1, axis=-1)  # Shape: 4 * [N, 1].

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        quantized = (
            (xmin / per_bin_w).floor().clamp(0, n_bins_w - 1),
            (ymin / per_bin_h).floor().clamp(0, n_bins_h - 1),
            (xmax / per_bin_w).floor().clamp(0, n_bins_w - 1),
            (ymax / per_bin_h).floor().clamp(0, n_bins_h - 1),
        )
        return ops.cat(quantized, dim=-1).int()

    def dequantize(self, boxes: Tensor, size):
        """Convert bin indices back to continuous coordinates at bin centers."""
        n_bins_w, n_bins_h = self.bins    # Quantization bins.
        img_w, img_h = size               # Original image size.
        per_bin_w = img_w / n_bins_w
        per_bin_h = img_h / n_bins_h
        xmin, ymin, xmax, ymax = boxes.split(1, axis=-1)  # Shape: 4 * [N, 1].

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        # Add 0.5 to use the center position of the bin as the coordinate.
        dequantized = (
            (xmin + 0.5) * per_bin_w,
            (ymin + 0.5) * per_bin_h,
            (xmax + 0.5) * per_bin_w,
            (ymax + 0.5) * per_bin_h,
        )
        return ops.cat(dequantized, dim=-1)


class CoordinatesQuantizer:
    """Quantize/dequantize point coordinates of shape (N, 2).

    Args:
        mode: Quantization strategy; only ``'floor'`` is implemented.
        bins: ``(width_bins, height_bins)`` grid resolution.
    """

    def __init__(self, mode, bins):
        self.mode = mode
        self.bins = bins

    def quantize(self, coordinates: Tensor, size):
        """Convert continuous ``(x, y)`` points into integer bin indices."""
        n_bins_w, n_bins_h = self.bins    # Quantization bins.
        img_w, img_h = size               # Original image size.
        per_bin_w = img_w / n_bins_w
        per_bin_h = img_h / n_bins_h
        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
        x, y = coordinates.split(1, axis=-1)  # Shape: 2 * [N, 1].

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        quantized = (
            (x / per_bin_w).floor().clamp(0, n_bins_w - 1),
            (y / per_bin_h).floor().clamp(0, n_bins_h - 1),
        )
        return ops.cat(quantized, dim=-1).int()

    def dequantize(self, coordinates: Tensor, size):
        """Convert bin indices back to continuous coordinates at bin centers."""
        n_bins_w, n_bins_h = self.bins    # Quantization bins.
        img_w, img_h = size               # Original image size.
        per_bin_w = img_w / n_bins_w
        per_bin_h = img_h / n_bins_h
        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
        x, y = coordinates.split(1, axis=-1)  # Shape: 2 * [N, 1].

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        # Add 0.5 to use the center position of the bin as the coordinate.
        dequantized = (
            (x + 0.5) * per_bin_w,
            (y + 0.5) * per_bin_h,
        )
        return ops.cat(dequantized, dim=-1)


class Florence2PostProcesser:
"""
Florence-2 post process for converting text prediction to various tasks results.

Args:
config: A dict of configs.
tokenizer: A tokenizer for decoding text to spans.
sample config:
UNIFIED_POST_PROCESS:
# common configs
NUM_BBOX_HEIGHT_BINS: 1000
NUM_BBOX_WIDTH_BINS: 1000
COORDINATES_HEIGHT_BINS: 1000
COORDINATES_WIDTH_BINS: 1000
# task specific configs, override the common configs
PARSE_TASKS:
- TASK_NAME: 'video_dense_caption'
PATTERN: 'r<time_(\d+)><time_(\d+)>([a-zA-Z0-9 ]+)'
SCORE_MODE: 'avg_cat_name_scores'
NUM_BINS: 100
- TASK_NAME: 'od'
PATTERN: 'r<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>([a-zA-Z0-9 ]+)'
SCORE_MODE: 'avg_cat_name_scores'

Returns:
parsed_dict (dict): A dict of parsed results.
"""
def __init__(
self,
tokenizer=None
):
parse_tasks = []
parse_task_configs = {}
config = self._create_default_config()
for task in config['PARSE_TASKS']:
parse_tasks.append(task['TASK_NAME'])
parse_task_configs[task['TASK_NAME']] = task

self.config = config
self.parse_tasks = parse_tasks
self.parse_tasks_configs = parse_task_configs

self.tokenizer = tokenizer
if self.tokenizer is not None:
self.all_special_tokens = set(self.tokenizer.all_special_tokens)

self.init_quantizers()
self.black_list_of_phrase_grounding = self._create_black_list_of_phrase_grounding()

def _create_black_list_of_phrase_grounding(self):
black_list = {}

if 'phrase_grounding' in self.parse_tasks and self.parse_tasks_configs['phrase_grounding']['FILTER_BY_BLACK_LIST']:
black_list = set(
['it', 'I', 'me', 'mine',
'you', 'your', 'yours',
'he', 'him', 'his',
'she', 'her', 'hers',
'they', 'them', 'their', 'theirs',
'one', 'oneself',
'we', 'us', 'our', 'ours',
'you', 'your', 'yours',
'they', 'them', 'their', 'theirs',
'mine', 'yours', 'his', 'hers', 'its',
'ours', 'yours', 'theirs',
'myself', 'yourself', 'himself', 'herself', 'itself',
'ourselves', 'yourselves', 'themselves',
'this', 'that',
'these', 'those',
'who', 'whom', 'whose', 'which', 'what',
'who', 'whom', 'whose', 'which', 'that',
'all', 'another', 'any', 'anybody', 'anyone', 'anything',
'each', 'everybody', 'everyone', 'everything',
'few', 'many', 'nobody', 'none', 'one', 'several',
'some', 'somebody', 'someone', 'something',
'each other', 'one another',
'myself', 'yourself', 'himself', 'herself', 'itself',
'ourselves', 'yourselves', 'themselves',
'the image', 'image', 'images', 'the', 'a', 'an', 'a group',
'other objects', 'lots', 'a set',
]
)

return black_list

def _create_default_config(self):
config = {
'NUM_BBOX_HEIGHT_BINS': 1000,
'NUM_BBOX_WIDTH_BINS': 1000,
'BOX_QUANTIZATION_MODE': 'floor',
'COORDINATES_HEIGHT_BINS': 1000,
'COORDINATES_WIDTH_BINS': 1000,
'COORDINATES_QUANTIZATION_MODE': 'floor',
'PARSE_TASKS': [
{
'TASK_NAME': 'od',
'PATTERN': r'([a-zA-Z0-9 ]+)<loc_(\\d+)><loc_(\\d+)><loc_(\\d+)><loc_(\\d+)>'
},
{
'TASK_NAME': 'ocr',
'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
'AREA_THRESHOLD': 0.00
},
{
'TASK_NAME': 'phrase_grounding',
'FILTER_BY_BLACK_LIST': True
},
{
'TASK_NAME': 'pure_text',
},
{
'TASK_NAME': 'description_with_bboxes',
},
{
'TASK_NAME': 'description_with_polygons',
},
{
'TASK_NAME': 'polygons',
},
{
'TASK_NAME': 'bboxes',
},
{
'TASK_NAME': 'description_with_bboxes_or_polygons',
}
]
}

return config

def init_quantizers(self):
# we have box_quantizer (od, grounding) and coordinates_quantizer (ocr, referring_segmentation)
num_bbox_height_bins = self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
num_bbox_width_bins = self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
box_quantization_mode = self.config.get('BOX_QUANTIZATION_MODE', 'floor')
self.box_quantizer = BoxQuantizer(
box_quantization_mode,
(num_bbox_width_bins, num_bbox_height_bins),
)

num_bbox_height_bins = self.config['COORDINATES_HEIGHT_BINS'] if 'COORDINATES_HEIGHT_BINS' in self.config else self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
num_bbox_width_bins = self.config['COORDINATES_WIDTH_BINS'] if 'COORDINATES_WIDTH_BINS' in self.config else self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE') if 'COORDINATES_QUANTIZATION_MODE' in self.config else self.config.get('BOX_QUANTIZATION_MODE', 'floor')
self.coordinates_quantizer = CoordinatesQuantizer(
box_quantization_mode,
(num_bbox_width_bins, num_bbox_height_bins),
)

    def decode_with_spans(self, tokenizer, token_ids):
        """Decode *token_ids* to text while recording each token's character span.

        Args:
            tokenizer: A Bart or T5 tokenizer (fast or slow); other types raise.
            token_ids: Sequence of ids to decode.

        Returns:
            tuple: ``(text, spans)`` where ``spans[i]`` is the ``[start, end)``
            character range that token ``i`` occupies inside ``text``.
        """
        filtered_tokens = tokenizer.convert_ids_to_tokens(
            token_ids, skip_special_tokens=False)
        # One decoded token string per input id — spans stay aligned by index.
        assert len(filtered_tokens) == len(token_ids)

        # To avoid mixing byte-level and unicode for byte-level BPT
        # we need to build string separately for added tokens and byte-level tokens
        # cf. https://github.com/huggingface/transformers/issues/1133
        sub_texts = []
        for token in filtered_tokens:
            if token in self.all_special_tokens:
                # Special tokens (e.g. <loc_5>) are kept verbatim.
                sub_texts.append(token)
            else:
                if isinstance(tokenizer, (BartTokenizer, BartTokenizerFast)):
                    sub_text = tokenizer.convert_tokens_to_string([token])
                elif isinstance(tokenizer, (T5Tokenizer, T5TokenizerFast)):
                    # Ref: https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol
                    # Note: Do not strip sub_text as it may have functional whitespace
                    sub_text = token.replace('▁', ' ')
                else:
                    raise ValueError(f'type {type(tokenizer)} not supported')
                sub_texts.append(sub_text)

        # Concatenate the pieces, tracking each one's [start, end) span.
        text = ''
        spans = []
        for sub_text in sub_texts:
            span = (len(text), len(text) + len(sub_text))  # [start index, end index).
            text += sub_text
            spans.append(span)

        # Text format:
        # 1. T5Tokenizer/T5TokenizerFast:
        #     "<loc_1><loc_2><loc_3><loc_4> transplanting dog<loc_1><loc_2><loc_3><loc_4> cat</s>"
        #     Equivalent to t5_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
        # 2. BartTokenizer (need to double check):
        #     "<s><loc_1><loc_2><loc_3><loc_4>transplanting dog<loc_1><loc_2><loc_3><loc_4>cat</s>"
        #     Equivalent to bart_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
        return text, spans

def parse_od_from_text_and_spans(
self,
text,
pattern,
image_size,
phrase_centric=False
):
parsed = list(re.finditer(pattern, text))

instances = []
for i in range(len(parsed)):
# Prepare instance.
instance = {}

if phrase_centric:
bbox_bins = [int(parsed[i].group(j)) for j in range(2, 6)]
else:
bbox_bins = [int(parsed[i].group(j)) for j in range(1, 5)]
instance['bbox'] = self.box_quantizer.dequantize(
boxes=mindspore.tensor(bbox_bins),
size=image_size
).tolist()

if phrase_centric:
instance['cat_name'] = parsed[i].group(1).lower().strip()
else:
instance['cat_name'] = parsed[i].group(5).lower().strip()
instances.append(instance)

return instances

def parse_ocr_from_text_and_spans(self,
text,
pattern,
image_size,
area_threshold=-1.0,
):
bboxes = []
labels = []
text = text.replace('<s>', '')
# ocr with regions
parsed = re.findall(pattern, text)
instances = []
image_width, image_height = image_size

for ocr_line in parsed:
ocr_content = ocr_line[0]
quad_box = ocr_line[1:]
quad_box = [int(i) for i in quad_box]
quad_box = self.coordinates_quantizer.dequantize(
mindspore.tensor(np.array(quad_box).reshape(-1, 2)),
size=image_size
).reshape(-1).tolist()

if area_threshold > 0:
x_coords = list(quad_box[0::2])
y_coords = list(quad_box[1::2])

# apply the Shoelace formula
area = 0.5 * abs(sum(x_coords[i] * y_coords[i + 1] - x_coords[i + 1] * y_coords[i] for i in range(4 - 1)))

if area < (image_width * image_height) * area_threshold:
continue

bboxes.append(quad_box)
labels.append(ocr_content)
instances.append({
'quad_box': quad_box,
'text': ocr_content,
})
return instances

def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
# ignore <s> </s> and <pad>
cur_span = 0
if text.startswith('<s>'):
cur_span += 3

text = text.replace('<s>', '')
text = text.replace('</s>', '')
text = text.replace('<pad>', '')

pattern = r"([^<]+(?:<loc_\d+>){4,})"
phrases = re.findall(pattern, text)

# pattern should be text pattern and od pattern
pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'

instances = []
for pharse_text in phrases:
phrase_text_strip = pharse_text.replace('<ground>', '', 1)
phrase_text_strip = pharse_text.replace('<obj>', '', 1)

if phrase_text_strip == '':
cur_span += len(pharse_text)
continue

# Prepare instance.
instance = {}

# parse phrase, get string
phrase = re.search(pattern, phrase_text_strip)
if phrase is None:
cur_span += len(pharse_text)
continue

# parse bboxes by box_pattern
bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
if len(bboxes_parsed) == 0:
cur_span += len(pharse_text)
continue

phrase = phrase.group()
# remove leading and trailing spaces
phrase = phrase.strip()

if phrase in self.black_list_of_phrase_grounding:
cur_span += len(pharse_text)
continue

# a list of list
bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
instance['bbox'] = self.box_quantizer.dequantize(
boxes=mindspore.tensor(bbox_bins),
size=image_size
).tolist()

# exclude non-ascii characters
phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
instance['cat_name'] = phrase

instances.append(instance)

return instances

def parse_description_with_bboxes_from_text_and_spans(self, text, pattern, image_size, allow_empty_phrase=False):
    """
    Parse dense-captioning output into one instance per box.

    Args:
        text: raw decoded model output.
        pattern: unused; fixed patterns are built below (kept for interface parity).
        image_size: (width, height) used to dequantize box bins.
        allow_empty_phrase: also accept bare runs of ``<loc_*>`` tokens with no
            preceding phrase text.

    Returns:
        A list of dicts with keys ``'bbox'`` ([x1, y1, x2, y2]) and
        ``'cat_name'`` (ascii-only phrase) — one dict per parsed box.
    """
    # temporary parse solution, split by '.'
    # ignore <s> </s> and <pad>

    text = text.replace('<s>', '')
    text = text.replace('</s>', '')
    text = text.replace('<pad>', '')

    if allow_empty_phrase:
        # bare runs of >=4 location tokens with no phrase in front.
        # Fixed: the previous r-string kept the doubled braces ({{4,}}) from a
        # dropped f-string prefix, so the pattern matched literal braces and
        # never matched location-token runs.
        pattern = r"(?:(?:<loc_\d+>){4,})"
    else:
        pattern = r"([^<]+(?:<loc_\d+>){4,})"
    phrases = re.findall(pattern, text)

    # pattern should be text pattern and od pattern
    pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
    box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'

    instances = []
    for pharse_text in phrases:
        # strip one leading <ground> marker and one <obj> marker; chained so
        # both are removed (the original restarted the second replace from the
        # raw text, discarding the <ground> removal)
        phrase_text_strip = pharse_text.replace('<ground>', '', 1)
        phrase_text_strip = phrase_text_strip.replace('<obj>', '', 1)

        if phrase_text_strip == '' and not allow_empty_phrase:
            continue

        # parse phrase, get string
        phrase = re.search(pattern, phrase_text_strip)
        if phrase is None:
            continue

        phrase = phrase.group()
        # remove leading and trailing spaces
        phrase = phrase.strip()

        # parse bboxes by box_pattern
        bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
        if len(bboxes_parsed) == 0:
            continue

        # a list of list: one [x1, y1, x2, y2] bin quadruple per match
        bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]

        bboxes = self.box_quantizer.dequantize(
            boxes=mindspore.tensor(bbox_bins),
            size=image_size
        ).tolist()

        # exclude non-ascii characters
        phrase = phrase.encode('ascii', errors='ignore').decode('ascii')
        for _bboxes in bboxes:
            # Prepare instance: every box gets its own entry sharing the phrase.
            instance = {}
            instance['bbox'] = _bboxes
            instance['cat_name'] = phrase
            instances.append(instance)

    return instances

def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
                                                        allow_empty_phrase=False,
                                                        polygon_sep_token='<sep>',
                                                        polygon_start_token='<poly>',
                                                        polygon_end_token='</poly>',
                                                        with_box_at_start=False,
                                                        ):
    """
    Parse phrases followed by polygon coordinate runs into polygon instances.

    Args:
        text: raw decoded model output.
        pattern: unused; fixed patterns are built below (kept for interface parity).
        image_size: (width, height) used to dequantize coordinate bins.
        allow_empty_phrase: accept coordinate runs with no preceding phrase text.
        polygon_sep_token: token separating consecutive polygons of one instance.
        polygon_start_token / polygon_end_token: delimiters around one instance's polygons.
        with_box_at_start: treat the first 4 coordinates of a run as a bbox.

    Returns:
        A list of dicts with keys 'cat_name', 'polygons' (list of flat
        [x1, y1, x2, y2, ...] lists) and optionally 'bbox'.
    """

    # ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
    # ignore <s> </s> and <pad>

    text = text.replace('<s>', '')
    text = text.replace('</s>', '')
    text = text.replace('<pad>', '')

    if allow_empty_phrase:
        pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
    else:
        # [^<]+: This part matches one or more characters that are not the < symbol.
        # The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
        #
        pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
    phrases = re.findall(pattern, text)

    phrase_string_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_|<poly>)'
    box_pattern = rf'((?:<loc_\d+>)+)(?:{re.escape(polygon_sep_token)}|$)'

    # one polygons instance is separated by polygon_start_token and polygon_end_token
    polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'

    instances = []
    for phrase_text in phrases:

        # exclude loc_\d+>
        # need to get span if want to include category score
        # NOTE(review): the pattern has no leading '<' — presumably the '<' was
        # consumed by the phrase regex above; confirm against actual model output.
        phrase_text_strip = re.sub(r'^loc_\d+>', '', phrase_text, count=1)

        # phrase = phrase.replace('<poly>', '')
        # phrase = phrase.replace('poly>', '')

        if phrase_text_strip == '' and not allow_empty_phrase:
            continue

        # parse phrase, get string
        phrase = re.search(phrase_string_pattern, phrase_text_strip)
        if phrase is None:
            continue
        phrase = phrase.group()
        # remove leading and trailing spaces
        phrase = phrase.strip()

        # parse bboxes by box_pattern

        # split by polygon_start_token and polygon_end_token first using polygons_instance_pattern
        if polygon_start_token in phrase_text and polygon_end_token in phrase_text:
            polygons_instances_parsed = list(re.finditer(polygons_instance_pattern, phrase_text))
        else:
            # no explicit delimiters: treat the whole run as a single instance
            polygons_instances_parsed = [phrase_text]

        for _polygons_instances_parsed in polygons_instances_parsed:
            # Prepare instance.
            instance = {}

            # polygons_parsed= list(re.finditer(box_pattern, phrase_text))
            # a plain string means the un-delimited fallback above; a Match
            # object means a <poly>...</poly> capture
            if isinstance(_polygons_instances_parsed, str):
                polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed))
            else:
                polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
            if len(polygons_parsed) == 0:
                continue

            # a list of list (polygon)
            bbox = []
            polygons = []
            for _polygon_parsed in polygons_parsed:
                # group 1: whole <loc_\d+>...</loc_\d+>
                _polygon = _polygon_parsed.group(1)
                # parse into list of int
                _polygon = [int(_loc_parsed.group(1)) for _loc_parsed in re.finditer(r'<loc_(\d+)>', _polygon)]
                if with_box_at_start and len(bbox) == 0:
                    if len(_polygon) > 4:
                        # no valid bbox prediction
                        bbox = _polygon[:4]
                        _polygon = _polygon[4:]
                    else:
                        bbox = [0, 0, 0, 0]
                # abandon last element if is not paired
                if len(_polygon) % 2 == 1:
                    _polygon = _polygon[:-1]

                # reshape into (n, 2)
                _polygon = self.coordinates_quantizer.dequantize(
                    mindspore.tensor(np.array(_polygon).reshape(-1, 2)),
                    size=image_size
                ).reshape(-1).tolist()
                # reshape back
                polygons.append(_polygon)

            instance['cat_name'] = phrase
            instance['polygons'] = polygons
            if len(bbox) != 0:
                instance['bbox'] = self.box_quantizer.dequantize(
                    boxes=mindspore.tensor([bbox]),
                    size=image_size
                ).tolist()[0]

            instances.append(instance)

    return instances

def __call__(
    self,
    text=None,
    image_size=None,
    parse_tasks=None,
):
    """
    Run the configured parse tasks (or the requested subset) over *text*.

    Args:
        text: model outputs
        image_size: (width, height)
        parse_tasks: a list of tasks to parse, if None, parse all tasks.
    """
    if parse_tasks is not None:
        if isinstance(parse_tasks, str):
            parse_tasks = [parse_tasks]
        for requested_task in parse_tasks:
            assert requested_task in self.parse_tasks, f'parse task {requested_task} not supported'

    # sequence or text should be provided
    assert text is not None, 'text should be provided'

    parsed_dict = {'text': text}

    for task in self.parse_tasks:
        if parse_tasks is not None and task not in parse_tasks:
            continue

        pattern = self.parse_tasks_configs[task].get('PATTERN', None)

        if task == 'ocr':
            parsed_dict['ocr'] = self.parse_ocr_from_text_and_spans(
                text,
                pattern=pattern,
                image_size=image_size,
                area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.0),
            )
        elif task == 'phrase_grounding':
            parsed_dict['phrase_grounding'] = self.parse_phrase_grounding_from_text_and_spans(
                text,
                pattern=pattern,
                image_size=image_size,
            )
        elif task == 'pure_text':
            parsed_dict['pure_text'] = text
        elif task == 'description_with_bboxes':
            parsed_dict['description_with_bboxes'] = self.parse_description_with_bboxes_from_text_and_spans(
                text,
                pattern=pattern,
                image_size=image_size,
            )
        elif task == 'description_with_polygons':
            parsed_dict['description_with_polygons'] = self.parse_description_with_polygons_from_text_and_spans(
                text,
                pattern=pattern,
                image_size=image_size,
            )
        elif task == 'polygons':
            parsed_dict['polygons'] = self.parse_description_with_polygons_from_text_and_spans(
                text,
                pattern=pattern,
                image_size=image_size,
                allow_empty_phrase=True,
            )
        elif task == 'bboxes':
            parsed_dict['bboxes'] = self.parse_description_with_bboxes_from_text_and_spans(
                text,
                pattern=pattern,
                image_size=image_size,
                allow_empty_phrase=True,
            )
        elif task == 'description_with_bboxes_or_polygons':
            # only support either polygons or bboxes, not both at the same time
            if '<poly>' in text:
                parser = self.parse_description_with_polygons_from_text_and_spans
            else:
                parser = self.parse_description_with_bboxes_from_text_and_spans
            parsed_dict['description_with_bboxes_or_polygons'] = parser(
                text,
                pattern=pattern,
                image_size=image_size,
            )
        else:
            raise ValueError("task {} is not supported".format(task))

    return parsed_dict


__all__ = ['Florence2PostProcesser',
'Florence2Processor',]

+ 26
- 0
mindnlp/transformers/models/fsmt/__init__.py View File

@@ -0,0 +1,26 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
fsmt Model.
"""
from . import tokenization_fsmt, configuration_fsmt, modeling_fsmt

from .configuration_fsmt import *
from .tokenization_fsmt import *
from .modeling_fsmt import *

__all__ = []
__all__.extend(configuration_fsmt.__all__)
__all__.extend(tokenization_fsmt.__all__)
__all__.extend(modeling_fsmt.__all__)

+ 216
- 0
mindnlp/transformers/models/fsmt/configuration_fsmt.py View File

@@ -0,0 +1,216 @@
# coding=utf-8
# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""FSMT configuration"""

from mindnlp.utils import logging
from ...configuration_utils import PretrainedConfig

logger = logging.get_logger(__name__)


class DecoderConfig(PretrainedConfig):
    """Private helper configuration holding FSMT decoder specifics
    (decoder vocab size and decoder BOS token id)."""

    model_type = "fsmt_decoder"

    def __init__(self, vocab_size=0, bos_token_id=0):
        super().__init__()
        self.bos_token_id = bos_token_id
        self.vocab_size = vocab_size


class FSMTConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`FSMTModel`]. It is used to instantiate a FSMT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the FSMT
    [facebook/wmt19-en-ru](https://huggingface.co/facebook/wmt19-en-ru) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        langs (`List[str]`, *optional*, defaults to `["en", "de"]`):
            A list with source language and target_language (e.g., ['en', 'ru']).
        src_vocab_size (`int`):
            Vocabulary size of the encoder. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed to the forward method in the encoder.
        tgt_vocab_size (`int`):
            Vocabulary size of the decoder. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed to the forward method in the decoder.
        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
        activation_function (`str` or `Callable`, *optional*, defaults to `"relu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        scale_embedding (`bool`, *optional*, defaults to `True`):
            Scale embeddings by diving by sqrt(d_model).
        bos_token_id (`int`, *optional*, defaults to 0)
            Beginning of stream token id.
        pad_token_id (`int`, *optional*, defaults to 1)
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 2)
            End of stream token id.
        decoder_start_token_id (`int`, *optional*):
            This model starts decoding with `eos_token_id`
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            Google "layerdrop arxiv", as its not explainable in one line.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            Google "layerdrop arxiv", as its not explainable in one line.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether this is an encoder/decoder model.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie input and output embeddings.
        num_beams (`int`, *optional*, defaults to 5)
            Number of beams for beam search that will be used by default in the `generate` method of the model. 1 means
            no beam search.
        length_penalty (`float`, *optional*, defaults to 1)
            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
            `length_penalty` < 0.0 encourages shorter sequences.
        early_stopping (`bool`, *optional*, defaults to `False`)
            Flag that will be used by default in the `generate` method of the model. Whether to stop the beam search
            when at least `num_beams` sentences are finished per batch or not.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        forced_eos_token_id (`int`, *optional*, defaults to 2):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
            `eos_token_id`.

    Examples:

    ```python
    >>> from transformers import FSMTConfig, FSMTModel

    >>> # Initializing a FSMT facebook/wmt19-en-ru style configuration
    >>> config = FSMTConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = FSMTModel(config)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "fsmt"
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    # update the defaults from config file
    def __init__(
        self,
        langs=None,
        src_vocab_size=42024,
        tgt_vocab_size=42024,
        activation_function="relu",
        d_model=1024,
        max_length=200,
        max_position_embeddings=1024,
        encoder_ffn_dim=4096,
        encoder_layers=12,
        encoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_ffn_dim=4096,
        decoder_layers=12,
        decoder_attention_heads=16,
        decoder_layerdrop=0.0,
        attention_dropout=0.0,
        dropout=0.1,
        activation_dropout=0.0,
        init_std=0.02,
        decoder_start_token_id=2,
        is_encoder_decoder=True,
        scale_embedding=True,
        tie_word_embeddings=False,
        num_beams=5,
        length_penalty=1.0,
        early_stopping=False,
        use_cache=True,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        forced_eos_token_id=2,
        **common_kwargs,
    ):
        # `None` sentinel instead of a mutable `["en", "de"]` default, which
        # would be shared across every FSMTConfig instance
        if langs is None:
            langs = ["en", "de"]
        self.langs = langs
        self.src_vocab_size = src_vocab_size
        self.tgt_vocab_size = tgt_vocab_size
        self.d_model = d_model  # encoder_embed_dim and decoder_embed_dim

        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = self.num_hidden_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.init_std = init_std  # Normal(0, this parameter)
        self.activation_function = activation_function

        # the decoder's BOS is deliberately the eos_token_id (fairseq behavior)
        self.decoder = DecoderConfig(vocab_size=tgt_vocab_size, bos_token_id=eos_token_id)
        if "decoder" in common_kwargs:
            del common_kwargs["decoder"]

        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        # 3 Types of Dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.dropout = dropout

        self.use_cache = use_cache
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            decoder_start_token_id=decoder_start_token_id,
            is_encoder_decoder=is_encoder_decoder,
            tie_word_embeddings=tie_word_embeddings,
            forced_eos_token_id=forced_eos_token_id,
            max_length=max_length,
            num_beams=num_beams,
            length_penalty=length_penalty,
            early_stopping=early_stopping,
            **common_kwargs,
        )

__all__ = ["FSMTConfig"]

+ 1273
- 0
mindnlp/transformers/models/fsmt/modeling_fsmt.py View File

@@ -0,0 +1,1273 @@
# coding=utf-8
# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Original implementation: https://github.com/pytorch/fairseq/tree/master/examples/wmt19
# Authors:
# - @alexeib Alexei Baevski
# - @edunov Sergey Edunov
# - @michaelauli Michael Auli
# - @myleott Myle Ott
# - @nng555 Nathan Ng
# - David Grangier
# - Kyra Yee
#
# Paper: Facebook FAIR's WMT19 News Translation Task Submission https://arxiv.org/abs/1907.06616
#
"""PyTorch Fairseq model, ported from https://github.com/pytorch/fairseq/tree/master/examples/wmt19"""

import math
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import mindspore

from mindnlp.core import nn, ops
from mindnlp.core.nn import functional as F
from mindnlp.utils import logging

from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from .configuration_fsmt import FSMTConfig


logger = logging.get_logger(__name__)


# See all FSMT models at https://huggingface.co/models?filter=fsmt

# Porting notes:
# this one is modeled after BartModel*
#
# Currently only translation (fairseq also has weights for LM)
#
# fairseq provides weights for ru-en, en-ru and de-en, en-de pairs. All have been ported.
# - ru-en, en-ru use asymmetric vocab
# - de-en, en-de use a merged single vocab (but the code works as if they are separate)
#
# Differences with Bart:
# - not using bos token
# - 2 separate vocabs (src and target)
# - embed weights aren't tied
# - uses a model Ensemble (but that part isn't ported/implemented yet) - so we
# aren't getting as good of a BLEU score
# - uses a projection layer at the end of the decoder
# - doesn't use final_logits_bias
# - beam search: stops as soon as num_beams == len(hypos) (whereas transformers
# is not satisfied there and will continue searching until the next cycles
# aren't promising something better), comparing BLEU scores - the transformers
# algorithm is slightly superior, therefore using the latter. But if you want
# to match fairseq outputs, you need to pass ``early_stopping=True`` to ``generate()``.
#
# SinusoidalPositionalEmbedding is slightly different from Bart's - generates
# different embeddings. This implementation is copied verbatim from fairseq with
# some small changes to make it work here.
#
# Other changes:
# - doesn't support use_cache as Bart's version does
#
#
# FSMTConfig changes with BartConfig
#
# Differences with BART:
# - src/tgt vocabs aren't shared
# - token embeddings aren't shared
# - needs a language pair
# - scale_embedding are True
#
# some unused args were removed too
#
#
# TODO:
# - port model ensemble (fs uses 4 model checkpoints)
# - solve beam search discrepancies
# docstyle-ignore

"""

Here is how to compare BLEU scores against fairseq implementation:

# en-ru

export PAIR=en-ru
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=50
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq"
python examples/seq2seq/run_eval.py
facebook/wmt19-$PAIR $DATA_DIR/val.source
$SAVE_DIR/test_translations.txt
--reference_path $DATA_DIR/val.target
--score_path $SAVE_DIR/test_bleu.json
--bs $BS --task translation --num_beams $NUM_BEAMS

# (fairseq BLEU: 36.4 http://matrix.statmt.org/matrix/output/1914?score_id=37605)


# ru-en

export PAIR=ru-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=50
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
PYTHONPATH="src:examples/seq2seq"
python examples/seq2seq/run_eval.py
facebook/wmt19-$PAIR $DATA_DIR/val.source
$SAVE_DIR/test_translations.txt
--reference_path $DATA_DIR/val.target
--score_path $SAVE_DIR/test_bleu.json
--bs $BS --task translation --num_beams $NUM_BEAMS


# (fairseq BLEU: 41.3 http://matrix.statmt.org/matrix/output/1907?run_id=6937)


# de-en

export PAIR=de-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=50
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq"
python examples/seq2seq/run_eval.py
facebook/wmt19-$PAIR $DATA_DIR/val.source
$SAVE_DIR/test_translations.txt
--reference_path $DATA_DIR/val.target
--score_path $SAVE_DIR/test_bleu.json
--bs $BS --task translation --num_beams $NUM_BEAMS

# (fairseq BLEU: 42.3 http://matrix.statmt.org/matrix/output/1902?run_id=6750)



# en-de

export PAIR=en-de
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq"
python examples/seq2seq/run_eval.py
facebook/wmt19-$PAIR $DATA_DIR/val.source
$SAVE_DIR/test_translations.txt
--reference_path $DATA_DIR/val.target
--score_path $SAVE_DIR/test_bleu.json
--bs $BS --task translation --num_beams $NUM_BEAMS

# (fairseq BLEU: 43.1 http://matrix.statmt.org/matrix/output/1909?run_id=6862)

"""

def invert_mask(attention_mask):
    """Turns 1->0, 0->1, False->True, True-> False"""
    assert attention_mask.dim() == 2
    inverted = attention_mask.eq(0)
    return inverted


def triu_onnx(x, diagonal=0):
    """ONNX-friendly upper-triangular: zero out entries below `diagonal`."""
    size = x.shape[0]
    idx = ops.arange(size)
    col_idx = idx.broadcast_to((size, size))
    row_idx = idx.unsqueeze(-1)
    if diagonal:
        row_idx = row_idx + diagonal
    keep = col_idx >= row_idx
    return x.masked_fill(keep == 0, 0)


def _prepare_fsmt_decoder_inputs(
    config,
    input_ids,
    decoder_input_ids=None,
    decoder_padding_mask=None,
    causal_mask_dtype=mindspore.float32,
):
    """
    Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if none are provided.
    This mimics the default behavior in fairseq. To override it pass in masks. Note: this is not called during
    generation
    """
    pad_token_id = config.pad_token_id
    # teacher forcing: derive decoder inputs from the labels shifted right
    if decoder_input_ids is None:
        decoder_input_ids = shift_tokens_right(input_ids, pad_token_id)
    bsz, tgt_len = decoder_input_ids.shape
    if decoder_padding_mask is None:
        # True at pad positions; None when there is no padding at all
        decoder_padding_mask = make_padding_mask(decoder_input_ids, pad_token_id)
    else:
        # caller-supplied masks use 1 = keep; internally 1/True means pad
        decoder_padding_mask = invert_mask(decoder_padding_mask)
    # strictly-upper-triangular -inf mask blocks attention to future positions
    causal_mask = triu_onnx(fill_with_neg_inf(ops.zeros(tgt_len, tgt_len, dtype=causal_mask_dtype)), 1)
    return decoder_input_ids, decoder_padding_mask, causal_mask

class PretrainedFSMTModel(PreTrainedModel):
    """Base class wiring FSMT submodels into the shared PreTrainedModel machinery
    (config class, weight init, dummy inputs)."""

    config_class = FSMTConfig
    base_model_prefix = "model"

    def _init_weights(self, module):
        """Initialize Linear/Embedding weights as Normal(0, config.init_std)."""
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight,mean=0.0,std=self.config.init_std)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, SinusoidalPositionalEmbedding):
            # deterministic sinusoidal table — leave untouched. NOTE(review):
            # this branch must stay before the nn.Embedding one in case
            # SinusoidalPositionalEmbedding subclasses nn.Embedding — confirm.
            pass
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight,mean=0.0,std=self.config.init_std)
            if module.padding_idx is not None:
                # keep the padding row at zero so pad tokens contribute nothing
                module.weight.data[module.padding_idx] = 0

    @property
    def dummy_inputs(self):
        # small fixed batch (second row ends with a pad token) for tracing/smoke tests
        pad_token = self.config.pad_token_id
        input_ids = mindspore.Tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]])
        dummy_inputs = {
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
        }
        return dummy_inputs


def _make_linear_from_emb(emb):
    """Build a bias-free Linear layer that shares *emb*'s weight.

    NOTE(review): the (vocab_size, emb_size) argument order mirrors the original
    fairseq port; the weight assignment below replaces the layer's weight outright,
    so the effective mapping follows the embedding's shape — confirm against callers.
    """
    vocab_size, emb_size = emb.weight.shape
    projection = nn.Linear(vocab_size, emb_size, bias=False)
    projection.weight.data = emb.weight.data
    return projection


# Helper Functions, mostly for making masks
def _check_shapes(shape_1, shape2):
if shape_1 != shape2:
raise AssertionError(f"shape mismatch: {shape_1} != {shape2}")


def shift_tokens_right(input_ids, pad_token_id):
    """Shift input ids one token to the right, and wrap the last non pad token (usually <eos>)."""

    # replace possible -100 values in labels by `pad_token_id`
    input_ids = input_ids.masked_fill(input_ids == -100, pad_token_id)

    prev_output_tokens = input_ids.copy()
    # per-row index of the last non-pad token (the eos position)
    index_of_eos = (input_ids.ne(pad_token_id).sum(axis=1) - 1).unsqueeze(-1)
    # the wrapped eos becomes the first decoder input token of each row
    prev_output_tokens[:, 0] = input_ids.gather_elements(1, index_of_eos).squeeze()
    # remaining positions are the input shifted right by one
    prev_output_tokens[:, 1:] = input_ids[:, :-1]
    return prev_output_tokens


def make_padding_mask(input_ids, padding_idx=1):
    """True for pad tokens"""
    mask = input_ids.eq(padding_idx)
    return mask if mask.any() else None


# Helper Modules


class EncoderLayer(nn.Module):
    """Single FSMT encoder block: self-attention then a feed-forward sublayer,
    each followed by dropout, residual add, and post-LayerNorm."""

    def __init__(self, config: FSMTConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = Attention(self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout)
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(self, x, encoder_padding_mask, layer_head_mask, output_attentions=False):
        """
        Args:
            x (`mindspore.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
            encoder_padding_mask (`torch.ByteTensor`): binary ByteTensor of shape
                *(batch, src_len)* where padding elements are indicated by `1`.
                for t_tgt, t_src is excluded (or masked out), =0 means it is
                included in attention
            layer_head_mask (`mindspore.Tensor`): mask for attention heads in a given layer of size
                *(config.encoder_attention_heads,)*.

        Returns:
            encoded output of shape *(seq_len, batch, embed_dim)*
        """
        residual = x
        x, attn_weights = self.self_attn(
            query=x,
            key=x,
            key_padding_mask=encoder_padding_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        # residual add then post-norm (fairseq layer ordering)
        x = residual + x
        x = self.self_attn_layer_norm(x)

        # feed-forward sublayer with its own residual/norm
        residual = x
        x = self.activation_fn(self.fc1(x))
        x = F.dropout(x, p=self.activation_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.final_layer_norm(x)
        return x, attn_weights


class FSMTEncoder(nn.Module):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`EncoderLayer`].

    Args:
        config: FSMTConfig
    """

    def __init__(self, config: FSMTConfig, embed_tokens):
        super().__init__()
        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop
        self.padding_idx = embed_tokens.padding_idx
        self.embed_tokens = embed_tokens
        embed_dim = embed_tokens.embedding_dim
        # embed_dim = embed_tokens.embedding_size
        # embeddings are scaled by sqrt(embed_dim) when config.scale_embedding
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
        self.embed_positions = SinusoidalPositionalEmbedding(
            config.max_position_embeddings + self.padding_idx + 1, embed_dim, self.padding_idx
        )
        self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)])  # type: List[EncoderLayer]

    def forward(
        self,
        input_ids: mindspore.Tensor,
        attention_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: mindspore.Tensor = None,
        head_mask: Optional[mindspore.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """
        Args:
            input_ids (`mindspore.Tensor`): tokens in the source language of shape
                *(batch, src_len)*
            attention_mask (`mindspore.Tensor`): indicating which indices are padding tokens
            inputs_embeds (`mindspore.Tensor`):
                embedding vectors of shape *(batch, src_len, embed_dim)*
            head_mask (`mindspore.Tensor` of shape `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

        Returns:
            BaseModelOutput or Tuple comprised of:

            - **x** (`mindspore.Tensor`): the last encoder layer's output of shape *(src_len, batch, embed_dim)*
            - **encoder_states** (`Tuple(mindspore.Tensor`)): all intermediate hidden states of shape *(src_len,
              batch, embed_dim)*. Only populated if *output_hidden_states:* is True.
            - **all_attentions** (`Tuple(mindspore.Tensor`)): Attention weights for each layer.
              During training might not be of length n_layers because of layer dropout.
        """
        # check attention mask and invert
        if attention_mask is not None:
            attention_mask = invert_mask(attention_mask)

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
            embed_pos = self.embed_positions(input_ids)
        elif inputs_embeds is not None:
            inputs_embeds = inputs_embeds * self.embed_scale

            # We assume zeros hidden states correspond to padding tokens
            # and create `position_ids` where inputs_embeds[:, :, 0] == 0
            position_ids = inputs_embeds[:, :, 0].masked_fill(
                inputs_embeds[:, :, 0].eq(0), self.embed_positions.padding_idx
            )

            embed_pos = self.embed_positions(position_ids)
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        x = inputs_embeds + embed_pos
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = ops.transpose(x, 0, 1)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            assert head_mask.shape[0] == (
                len(self.layers)
            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.shape[0]}."
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                # hidden states are recorded batch-first, so round-trip the transpose
                x = ops.transpose(x, 0, 1)  # T x B x C -> B x T x C
                encoder_states += (x,)
                x = ops.transpose(x, 0, 1)  # B x T x C -> T x B x C
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = ops.rand([])
            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                attn = None
            else:
                x, attn = encoder_layer(
                    x,
                    attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    output_attentions=output_attentions,
                )

            if output_attentions:
                all_attentions = all_attentions + (attn,)

        # T x B x C -> B x T x C
        x = ops.transpose(x, 0, 1)

        if output_hidden_states:
            encoder_states += (x,)

        if not return_dict:
            return tuple(v for v in [x, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(last_hidden_state=x, hidden_states=encoder_states, attentions=all_attentions)


class DecoderLayer(nn.Module):
    """A single FSMT decoder layer.

    Runs masked self-attention, cross-attention over the encoder output, and a
    two-layer feed-forward network; each sub-block is followed by dropout, a
    residual connection and LayerNorm (post-norm layout).
    """

    def __init__(self, config: FSMTConfig):
        super().__init__()
        self.embed_dim = config.d_model

        # Decoder self-attention; causality is enforced via `causal_mask` in forward.
        self.self_attn = Attention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        # Cross-attention: queries come from the decoder, keys/values from the encoder.
        self.encoder_attn = Attention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        x,
        encoder_hidden_states,
        encoder_attn_mask=None,
        layer_state=None,
        causal_mask=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        decoder_padding_mask=None,
        output_attentions=False,
    ):
        """Run one decoder layer.

        Args:
            x: decoder hidden states in time-first layout (tgt_len, batch, embed_dim).
            encoder_hidden_states: encoder output used as cross-attention keys/values.
            encoder_attn_mask: padding mask over encoder keys.
            layer_state: per-layer attention cache dict; mutated in place by both
                attention modules (each writes under its own `cache_key`).
            causal_mask: additive mask enforcing autoregressive self-attention.
            layer_head_mask: per-head mask for self-attention.
            cross_attn_layer_head_mask: per-head mask for cross-attention.
            decoder_padding_mask: padding mask over decoder tokens.
            output_attentions: whether attention weights are returned.

        Returns:
            Tuple of (hidden states, self-attention weights, updated layer cache,
            cross-attention weights).
        """
        residual = x

        if layer_state is None:
            layer_state = {}

        # Self Attention
        x, self_attn_weights = self.self_attn(
            query=x,
            key=x,
            layer_state=layer_state,  # adds keys to layer state
            key_padding_mask=decoder_padding_mask,
            attn_mask=causal_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.self_attn_layer_norm(x)

        # Cross attention
        residual = x
        # Self- and cross-attention must write to distinct cache slots.
        assert self.encoder_attn.cache_key != self.self_attn.cache_key
        x, cross_attn_weights = self.encoder_attn(
            query=x,
            key=encoder_hidden_states,
            key_padding_mask=encoder_attn_mask,
            layer_state=layer_state,  # mutates layer state
            layer_head_mask=cross_attn_layer_head_mask,
            output_attentions=output_attentions,
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.encoder_attn_layer_norm(x)

        # Fully Connected
        residual = x
        x = self.activation_fn(self.fc1(x))
        x = F.dropout(x, p=self.activation_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.final_layer_norm(x)
        return (
            x,
            self_attn_weights,
            layer_state,
            cross_attn_weights,
        )  # layer_state = cache for decoding


class FSMTDecoder(nn.Module):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DecoderLayer`]

    Args:
        config: FSMTConfig
        embed_tokens (nn.Embedding): output embedding
    """

    def __init__(self, config: FSMTConfig, embed_tokens: nn.Embedding):
        super().__init__()
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = embed_tokens.padding_idx
        # FAIRSEQ-style sqrt(d_model) embedding scaling when configured.
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
        self.embed_tokens = embed_tokens
        embed_dim = embed_tokens.embedding_dim
        self.embed_positions = SinusoidalPositionalEmbedding(
            config.max_position_embeddings + self.padding_idx + 1, embed_dim, self.padding_idx
        )
        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.decoder_layers)])  # type: List[DecoderLayer]

        # Output projection maps d_model -> tgt vocab and shares its weight
        # Parameter with the input embedding table (tied embeddings).
        embed_tokens_weight_shape = self.embed_tokens.weight.shape
        self.output_projection = nn.Linear(embed_tokens_weight_shape[1], embed_tokens_weight_shape[0], bias=False)
        self.output_projection.weight = self.embed_tokens.weight

    def _tie_weights(self):
        # Re-tie after any resizing so both modules share a single Parameter.
        self.embed_tokens.weight = self.output_projection.weight

    def forward(
        self,
        input_ids: mindspore.Tensor,
        encoder_hidden_states: mindspore.Tensor,
        encoder_padding_mask: mindspore.Tensor,
        decoder_padding_mask: mindspore.Tensor,
        decoder_causal_mask: mindspore.Tensor,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        cross_attn_head_mask: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[List[mindspore.Tensor]] = None,
        use_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        """
        Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al.,
        EMNLP 2019).

        Args:
            input_ids (`mindspore.Tensor` of shape `(batch, tgt_len)`):
                previous decoder outputs for teacher forcing
            encoder_hidden_states: output from the encoder, used for
                encoder-side attention
            encoder_padding_mask: for ignoring pad tokens
            past_key_values (dict or None): dictionary used for storing state during generation
            head_mask (`mindspore.Tensor` of shape `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`mindspore.Tensor` of shape `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

        Returns:
            BaseModelOutputWithPast or tuple:

            - the decoder's features of shape *(batch, tgt_len, embed_dim)*
            - the cache
            - hidden states
            - attentions
        """
        # check attention mask and invert
        if encoder_padding_mask is not None:
            encoder_padding_mask = invert_mask(encoder_padding_mask)

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            # embed positions
            positions = self.embed_positions(input_ids)
            if use_cache:
                # incremental decoding: only the newest token goes through the layers
                input_ids = input_ids[:, -1:]
                positions = positions[:, -1:]  # happens after we embed them
            x = self.embed_tokens(input_ids) * self.embed_scale
        elif inputs_embeds is not None:
            # We assume zeros hidden states correspond to padding tokens
            # and create `position_ids` where inputs_embeds[:, :, 0] == 0
            position_ids = inputs_embeds[:, :, 0].masked_fill(
                inputs_embeds[:, :, 0].eq(0), self.embed_positions.padding_idx
            )
            positions = self.embed_positions(position_ids)
            x = inputs_embeds * self.embed_scale
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        x += positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        # Convert to FSMT output format: (BS, seq_len, model_dim) -> (seq_len, BS, model_dim)
        x = ops.transpose(x, 0, 1)
        encoder_hidden_states = ops.transpose(encoder_hidden_states, 0, 1)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attns = () if output_attentions else None
        next_decoder_cache = []

        # check if head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                # Fixed: report the shape of the mask actually being validated
                # (previously the message always printed `head_mask.shape[0]`,
                # even when `cross_attn_head_mask` failed the check).
                assert attn_mask.shape[0] == (len(self.layers)), (
                    f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                    f" {attn_mask.shape[0]}."
                )
        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                # hidden states are collected in (BS, seq_len, dim) layout
                x = ops.transpose(x, 0, 1)
                all_hidden_states += (x,)
                x = ops.transpose(x, 0, 1)
            if self.training:
                dropout_probability = ops.rand([])
                if dropout_probability < self.layerdrop:
                    # LayerDrop skips the whole layer during training only
                    continue

            layer_state = past_key_values[idx] if past_key_values is not None else None

            x, layer_self_attn, layer_past, layer_cross_attn = decoder_layer(
                x,
                encoder_hidden_states,
                encoder_attn_mask=encoder_padding_mask,
                decoder_padding_mask=decoder_padding_mask,
                layer_state=layer_state,
                causal_mask=decoder_causal_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
                output_attentions=output_attentions,
            )

            if use_cache:
                next_decoder_cache.append(layer_past.copy())

            if output_attentions:
                all_self_attns += (layer_self_attn,)
                all_cross_attns += (layer_cross_attn,)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            x = ops.transpose(x, 0, 1)
            all_hidden_states += (x,)
            x = ops.transpose(x, 0, 1)

        # Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim)
        x = ops.transpose(x, 0, 1)
        encoder_hidden_states = ops.transpose(encoder_hidden_states, 0, 1)

        # project back onto the target vocabulary (weight tied with embed_tokens)
        x = self.output_projection(x)

        next_cache = next_decoder_cache if use_cache else None

        if not return_dict:
            return tuple(
                v for v in [x, next_cache, all_hidden_states, all_self_attns, all_cross_attns] if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=x,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attns,
        )


def _reorder_buffer(attn_cache, new_order):
for k, input_buffer_k in attn_cache.items():
if input_buffer_k is not None:
attn_cache[k] = input_buffer_k.index_select(0, new_order)
return attn_cache


class Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper.

    Supports both self-attention and encoder-decoder (cross) attention; for the
    latter, keys/values are static across decoding steps and cached per layer.
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        bias=True,
        encoder_decoder_attention=False,  # otherwise self_attention
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        # queries are pre-scaled by 1/sqrt(head_dim)
        self.scaling = self.head_dim**-0.5

        self.encoder_decoder_attention = encoder_decoder_attention
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        # Slot name under which this module stores its k/v cache in a layer's state dict.
        self.cache_key = "encoder_decoder" if self.encoder_decoder_attention else "self"

    def _shape(self, tensor, seq_len, bsz):
        # (seq_len, bsz, embed_dim) -> (bsz * num_heads, seq_len, head_dim)
        return ops.transpose(tensor.view(seq_len, bsz * self.num_heads, self.head_dim), 0, 1)

    def forward(
        self,
        query,
        key: Optional[mindspore.Tensor],
        key_padding_mask: Optional[mindspore.Tensor] = None,
        layer_state: Optional[Dict[str, Optional[mindspore.Tensor]]] = None,
        attn_mask: Optional[mindspore.Tensor] = None,
        layer_head_mask: Optional[mindspore.Tensor] = None,
        output_attentions=False,
    ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor]]:
        """Input shape: Time(SeqLen) x Batch x Channel"""
        static_kv: bool = self.encoder_decoder_attention
        tgt_len, bsz, embed_dim = query.shape
        assert embed_dim == self.embed_dim
        assert list(query.shape) == [tgt_len, bsz, embed_dim]
        # get here for encoder decoder cause of static_kv
        if layer_state is not None:  # reuse k,v and encoder_padding_mask
            saved_state = layer_state.get(self.cache_key, {})
            if "prev_key" in saved_state and static_kv:
                # previous time steps are cached - no need to recompute key and value if they are static
                key = None
        else:
            saved_state = None
            layer_state = {}

        q = self.q_proj(query) * self.scaling
        if static_kv:
            # cross-attention: keys/values are projected from the encoder output
            if key is None:
                k = v = None
            else:
                k = self.k_proj(key)
                v = self.v_proj(key)
        else:
            # self-attention: keys/values are projected from the query itself
            k = self.k_proj(query)
            v = self.v_proj(query)

        q = self._shape(q, tgt_len, bsz)
        if k is not None:
            k = self._shape(k, -1, bsz)
        if v is not None:
            v = self._shape(v, -1, bsz)

        if saved_state is not None:
            # merge (or replace, for static_kv) with cached keys/values
            k, v, key_padding_mask = self._use_saved_state(k, v, saved_state, key_padding_mask, static_kv, bsz)

        # Update cache (mutates the caller-provided layer_state dict)
        layer_state[self.cache_key] = {
            "prev_key": k.view(bsz, self.num_heads, -1, self.head_dim),
            "prev_value": v.view(bsz, self.num_heads, -1, self.head_dim),
            "prev_key_padding_mask": key_padding_mask if not static_kv else None,
        }

        assert k is not None
        src_len = k.shape[1]
        # raw attention scores: (bsz * num_heads, tgt_len, src_len)
        attn_weights = ops.bmm(q, ops.transpose(k, 1, 2))
        assert attn_weights.shape == (bsz * self.num_heads, tgt_len, src_len)

        if attn_mask is not None:
            # additive mask (e.g. causal mask), broadcast over heads
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        # This is part of a workaround to get around fork/join parallelism not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.dim() == 0:
            key_padding_mask = None
        assert key_padding_mask is None or key_padding_mask.shape[:2] == (
            bsz,
            src_len,
        )

        if key_padding_mask is not None:  # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2)
            # padded positions get the most negative representable score before softmax
            attn_weights = attn_weights.masked_fill(reshaped, np.finfo(mindspore.dtype_to_nptype(attn_weights.dtype)).min)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = ops.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            assert layer_head_mask.shape == (
                self.num_heads,
            ), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.shape}"
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # make sure that attn_weights are included in graph
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = F.dropout(
            attn_weights,
            p=self.dropout,
            training=self.training,
        )

        assert v is not None
        attn_output = ops.bmm(attn_probs, v)
        assert attn_output.shape == (bsz * self.num_heads, tgt_len, self.head_dim)
        # back to (tgt_len, bsz, embed_dim)
        attn_output = ops.transpose(attn_output, 0, 1).view(tgt_len, bsz, embed_dim)
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped

    def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz):
        # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
        if "prev_key" in saved_state:
            _prev_key = saved_state["prev_key"]
            assert _prev_key is not None
            prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                # cross-attention: cached keys are reused verbatim
                k = prev_key
            else:
                # self-attention: append the new step's keys to the cache
                assert k is not None
                k = ops.cat([prev_key, k], dim=1)
        if "prev_value" in saved_state:
            _prev_value = saved_state["prev_value"]
            assert _prev_value is not None
            prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                v = prev_value
            else:
                assert v is not None
                v = ops.cat([prev_value, v], dim=1)
        assert k is not None and v is not None
        prev_key_padding_mask: Optional[mindspore.Tensor] = saved_state.get("prev_key_padding_mask", None)
        if prev_key_padding_mask is not None:
            if static_kv:
                new_key_padding_mask = prev_key_padding_mask
            else:
                new_key_padding_mask = ops.cat([prev_key_padding_mask, key_padding_mask], dim=1)
        else:
            new_key_padding_mask = key_padding_mask
        return k, v, new_key_padding_mask


def fill_with_neg_inf(t):
    """FP16-compatible helper: fill *t* with the most negative finite value of its dtype."""
    smallest = np.finfo(mindspore.dtype_to_nptype(t.dtype)).min
    filled = t.float().fill(smallest)
    return filled.type_as(t)


# Public API
def _get_shape(t):
return getattr(t, "shape", None)


class FSMTModel(PretrainedFSMTModel):
    """Bare FSMT encoder-decoder model returning raw hidden states (no LM loss)."""

    _tied_weights_keys = ["decoder.embed_tokens.weight", "decoder.output_projection.weight"]

    def __init__(self, config: FSMTConfig):
        super().__init__(config)

        padding_idx = config.pad_token_id
        # FSMT keeps separate source and target vocabularies, hence two embedding tables.
        encoder_embed_tokens = nn.Embedding(config.src_vocab_size, config.d_model, padding_idx)
        decoder_embed_tokens = nn.Embedding(config.tgt_vocab_size, config.d_model, padding_idx)

        self.encoder = FSMTEncoder(config, encoder_embed_tokens)
        self.decoder = FSMTDecoder(config, decoder_embed_tokens)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        # Encoder sub-module (used e.g. by generation utilities).
        return self.encoder

    def get_decoder(self):
        # Decoder sub-module.
        return self.decoder

    def _tie_weights(self):
        # Tie the decoder's input embedding and output projection to the input
        # embeddings when the config requests shared word embeddings.
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.get_input_embeddings())
            self._tie_or_clone_weights(self.decoder.output_projection, self.get_input_embeddings())

    def forward(
        self,
        input_ids: mindspore.Tensor,
        attention_mask: Optional[mindspore.Tensor] = None,
        decoder_input_ids: Optional[mindspore.Tensor] = None,
        decoder_attention_mask: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        decoder_head_mask: Optional[mindspore.Tensor] = None,
        cross_attn_head_mask: Optional[mindspore.Tensor] = None,
        encoder_outputs: Optional[Tuple[mindspore.Tensor]] = None,
        past_key_values: Optional[Tuple[mindspore.Tensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        decoder_inputs_embeds: Optional[mindspore.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor], Seq2SeqModelOutput]:
        """Encode the source and decode with the target-side inputs.

        Returns a `Seq2SeqModelOutput` (or a plain tuple when `return_dict=False`)
        with decoder features, cache, hidden states and attentions.
        """
        # Incremental caching only makes sense when decoder inputs are given.
        if decoder_input_ids is None:
            use_cache = False

        # Fall back to config-level defaults for any unset flag.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # make masks if user doesn't supply
        if not use_cache and input_ids is not None:
            decoder_input_ids, decoder_padding_mask, causal_mask = _prepare_fsmt_decoder_inputs(
                self.config,
                input_ids,
                decoder_input_ids=decoder_input_ids,
                decoder_padding_mask=decoder_attention_mask,
                causal_mask_dtype=self.decoder.embed_tokens.weight.dtype,
            )
        else:
            decoder_padding_mask, causal_mask = None, None

        if decoder_input_ids is None and decoder_inputs_embeds is None:
            raise ValueError("Make sure that `decoder_input_ids` or `decoder_inputs_embeds` are passed.")

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            decoder_input_ids,
            encoder_outputs[0],
            attention_mask,
            decoder_padding_mask,
            decoder_causal_mask=causal_mask,
            inputs_embeds=decoder_inputs_embeds,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def get_input_embeddings(self):
        # Input embeddings live on the encoder (source vocabulary).
        return self.encoder.embed_tokens

    def set_input_embeddings(self, value):
        self.encoder.embed_tokens = value

    def get_output_embeddings(self):
        # Output embeddings live on the decoder (target vocabulary).
        return self.decoder.embed_tokens

    def set_output_embeddings(self, value):
        self.decoder.embed_tokens = value


class FSMTForConditionalGeneration(PretrainedFSMTModel):
    """FSMT seq2seq model for machine translation / conditional generation."""

    base_model_prefix = "model"
    _tied_weights_keys = ["decoder.embed_tokens.weight", "decoder.output_projection.weight"]

    def __init__(self, config: FSMTConfig):
        super().__init__(config)
        base_model = FSMTModel(config)
        self.model = base_model

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: mindspore.Tensor,
        attention_mask: Optional[mindspore.Tensor] = None,
        decoder_input_ids: Optional[mindspore.Tensor] = None,
        decoder_attention_mask: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        decoder_head_mask: Optional[mindspore.Tensor] = None,
        cross_attn_head_mask: Optional[mindspore.Tensor] = None,
        encoder_outputs: Optional[Tuple[mindspore.Tensor]] = None,
        past_key_values: Optional[Tuple[mindspore.Tensor]] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        decoder_inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor], Seq2SeqLMOutput]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:
            `Seq2SeqLMOutput` (or a tuple when `return_dict=False`) carrying the LM
            logits and, when `labels` are given, the cross-entropy loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            # teacher-forced training: incremental decoding cache is not applicable
            use_cache = False

        outputs = self.model(
            input_ids,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_inputs_embeds=decoder_inputs_embeds,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # FSMTDecoder already applies the tied output projection, so the model's
        # first output is directly the vocabulary logits.
        lm_logits = outputs[0]

        masked_lm_loss = None
        if labels is not None:
            masked_lm_loss = F.cross_entropy(lm_logits.view(-1, self.config.tgt_vocab_size), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        # Assemble the kwargs used by the generation loop for each decoding step.
        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }

    def prepare_decoder_input_ids_from_labels(self, labels: mindspore.Tensor):
        # Shift labels right to build teacher-forcing decoder inputs.
        return shift_tokens_right(labels, self.config.pad_token_id)

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = []
        for layer_past in past_key_values:
            # get the correct batch idx from decoder layer's batch dim for cross and self-attn
            layer_past_new = {
                attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items()
            }
            reordered_past.append(layer_past_new)
        return reordered_past

    def get_encoder(self):
        return self.model.encoder

    def get_decoder(self):
        return self.model.decoder

    def get_output_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_output_embeddings(self, value):
        self.model.decoder.embed_tokens = value


class SinusoidalPositionalEmbedding(nn.Embedding):
    """
    This module produces sinusoidal positional embeddings of any length.

    We don't want to save the weight of this embedding since it's not trained (deterministic) and it can be huge.

    Padding symbols are ignored.

    These embeddings get automatically extended in forward if more positions is needed.
    """

    def __init__(self, num_positions, embedding_dim, padding_idx):
        super().__init__(num_positions, embedding_dim, padding_idx=padding_idx)
        self.make_weight(num_positions, embedding_dim, padding_idx)

    def make_weight(self, num_positions, embedding_dim, padding_idx):
        """(Re)build the deterministic sinusoidal table and install it as a frozen weight."""
        weight = self.get_embedding(num_positions, embedding_dim, padding_idx)
        # match the dtype of the existing (randomly initialized) embedding weight
        weight = weight.to(dtype=self.weight.dtype)
        self.weight = mindspore.Parameter(weight)
        # deterministic table: must never receive gradient updates
        self.weight.requires_grad = False

    @staticmethod
    def get_embedding(num_embeddings, embedding_dim, padding_idx):
        """
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        # frequencies form a geometric progression controlled by the 10000 base
        emb = math.log(10000) / (half_dim - 1)
        emb = ops.exp(ops.arange(half_dim, dtype=mindspore.int64).float() * -emb)
        emb = ops.arange(num_embeddings, dtype=mindspore.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = ops.cat([ops.sin(emb), ops.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = ops.cat([emb, ops.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            # the padding position embeds to all-zeros
            emb[padding_idx, :] = 0
        return emb

    @staticmethod
    def make_positions(tensor, padding_idx: int):
        """
        Replace non-padding symbols with their position numbers.

        Position numbers begin at padding_idx+1. Padding symbols are ignored.
        """
        # The series of casts and type-conversions here are carefully
        # balanced to both work with ONNX export and XLA. In particular XLA
        # prefers ints, cumsum defaults to output longs, and ONNX doesn't know
        # how to handle the dtype kwarg in cumsum.
        mask = tensor.ne(padding_idx).int()
        return (ops.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx

    def forward(
        self,
        input,
        incremental_state: Optional[Any] = None,
        timestep: Optional[mindspore.Tensor] = None,
    ):
        """Input is expected to be of size [bsz x seqlen]."""
        bsz, seq_len = input.shape[:2]
        max_pos = self.padding_idx + 1 + seq_len
        if max_pos > self.weight.shape[0]:
            # expand embeddings if needed
            self.make_weight(max_pos, self.embedding_dim, self.padding_idx)
        positions = self.make_positions(input, self.padding_idx)
        return super().forward(positions)

__all__ = ["FSMTForConditionalGeneration", "FSMTModel", "PretrainedFSMTModel"]

+ 520
- 0
mindnlp/transformers/models/fsmt/tokenization_fsmt.py View File

@@ -0,0 +1,520 @@
# coding=utf-8
# Copyright 2019 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for FSMT."""

import json
import os
import re
import unicodedata
from typing import Dict, List, Optional, Tuple

from mindnlp.utils import logging
from ...tokenization_utils import PreTrainedTokenizer


logger = logging.get_logger(__name__)

# Filenames the tokenizer saves/loads: separate source- and target-language
# vocabularies plus the shared BPE merges file.
VOCAB_FILES_NAMES = {
    "src_vocab_file": "vocab-src.json",
    "tgt_vocab_file": "vocab-tgt.json",
    "merges_file": "merges.txt",
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings).

    Words with fewer than two symbols yield an empty set (the previous manual
    loop raised IndexError on an empty word).
    """
    # Pair each symbol with its successor; zip stops at the shorter operand,
    # so length-0 and length-1 words naturally produce no pairs.
    return set(zip(word, word[1:]))


def replace_unicode_punct(text):
"""
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
"""
text = text.replace(",", ",")
text = re.sub(r"。\s*", ". ", text)
text = text.replace("、", ",")
text = text.replace("”", '"')
text = text.replace("“", '"')
text = text.replace("∶", ":")
text = text.replace(":", ":")
text = text.replace("?", "?")
text = text.replace("《", '"')
text = text.replace("》", '"')
text = text.replace(")", ")")
text = text.replace("!", "!")
text = text.replace("(", "(")
text = text.replace(";", ";")
text = text.replace("1", "1")
text = text.replace("」", '"')
text = text.replace("「", '"')
text = text.replace("0", "0")
text = text.replace("3", "3")
text = text.replace("2", "2")
text = text.replace("5", "5")
text = text.replace("6", "6")
text = text.replace("9", "9")
text = text.replace("7", "7")
text = text.replace("8", "8")
text = text.replace("4", "4")
text = re.sub(r".\s*", ". ", text)
text = text.replace("~", "~")
text = text.replace("’", "'")
text = text.replace("…", "...")
text = text.replace("━", "-")
text = text.replace("〈", "<")
text = text.replace("〉", ">")
text = text.replace("【", "[")
text = text.replace("】", "]")
text = text.replace("%", "%")
return text


def remove_non_printing_char(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl

    Strips every character whose Unicode category starts with "C"
    (control, format, surrogate, private-use, unassigned).
    """
    return "".join(char for char in text if not unicodedata.category(char).startswith("C"))


# Porting notes:
# this one is modeled after XLMTokenizer
#
# added:
# - src_vocab_file,
# - tgt_vocab_file,
# - langs,


class FSMTTokenizer(PreTrainedTokenizer):
"""
Construct an FAIRSEQ Transformer tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:

- Moses preprocessing and tokenization.
- Normalizing all inputs text.
- The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols (like
"__classify__") to a vocabulary.
- The argument `langs` defines a pair of languages.

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
langs (`List[str]`, *optional*):
A list of two languages to translate from and to, for instance `["en", "ru"]`.
src_vocab_file (`str`, *optional*):
File containing the vocabulary for the source language.
tgt_vocab_file (`st`, *optional*):
File containing the vocabulary for the target language.
merges_file (`str`, *optional*):
File containing the merges.
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

<Tip>

When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.

</Tip>

sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.

"""

vocab_files_names = VOCAB_FILES_NAMES
model_input_names = ["input_ids", "attention_mask"]

def __init__(
    self,
    langs=None,
    src_vocab_file=None,
    tgt_vocab_file=None,
    merges_file=None,
    do_lower_case=False,
    unk_token="<unk>",
    bos_token="<s>",
    sep_token="</s>",
    pad_token="<pad>",
    **kwargs,
):
    """Load the source/target vocabularies and BPE merges and set up Moses state.

    Args:
        langs: two-element list ``[src_lang, tgt_lang]``; required.
        src_vocab_file: JSON file mapping source token -> id.
        tgt_vocab_file: JSON file mapping target token -> id.
        merges_file: text file with one BPE merge rule per line.
        do_lower_case: whether to lowercase input before tokenizing.
        unk_token / bos_token / sep_token / pad_token: special-token strings.
        kwargs: forwarded to the ``PreTrainedTokenizer`` superclass.

    Raises:
        ImportError: if `sacremoses` is not installed.
        ValueError: if ``langs`` is not a list of exactly two languages.
    """
    # sacremoses is an optional dependency: import lazily so the module can
    # be imported without it, and fail with an actionable message here.
    try:
        import sacremoses
    except ImportError:
        raise ImportError(
            "You need to install sacremoses to use XLMTokenizer. "
            "See https://pypi.org/project/sacremoses/ for installation."
        )

    self.sm = sacremoses

    self.src_vocab_file = src_vocab_file
    self.tgt_vocab_file = tgt_vocab_file
    self.merges_file = merges_file
    self.do_lower_case = do_lower_case

    # cache of sm.MosesPunctNormalizer instance
    self.cache_moses_punct_normalizer = {}
    # cache of sm.MosesTokenizer instance
    self.cache_moses_tokenizer = {}
    self.cache_moses_detokenizer = {}

    if langs and len(langs) == 2:
        self.src_lang, self.tgt_lang = langs
    else:
        raise ValueError(
            f"arg `langs` needs to be a list of 2 langs, e.g. ['en', 'ru'], but got {langs}. "
            "Usually that means that tokenizer can't find a mapping for the given model path "
            "in and other maps of this tokenizer."
        )

    with open(src_vocab_file, encoding="utf-8") as src_vocab_handle:
        self.encoder = json.load(src_vocab_handle)
    with open(tgt_vocab_file, encoding="utf-8") as tgt_vocab_handle:
        tgt_vocab = json.load(tgt_vocab_handle)
        # the file maps token -> id; invert it so decoding can go id -> token
        self.decoder = {v: k for k, v in tgt_vocab.items()}
    with open(merges_file, encoding="utf-8") as merges_handle:
        # split("\n") leaves a trailing empty element; [:-1] drops it
        merges = merges_handle.read().split("\n")[:-1]
    # keep only the first two fields of each merge rule line
    merges = [tuple(merge.split()[:2]) for merge in merges]
    # rank of each merge = its line number; lower rank merges apply first in bpe()
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
    super().__init__(
        langs=langs,
        src_vocab_file=src_vocab_file,
        tgt_vocab_file=tgt_vocab_file,
        merges_file=merges_file,
        do_lower_case=do_lower_case,
        unk_token=unk_token,
        bos_token=bos_token,
        sep_token=sep_token,
        pad_token=pad_token,
        **kwargs,
    )

# hack override
def get_vocab(self) -> Dict[str, int]:
    """Return the *source*-side vocabulary (the base class expects a single vocab)."""
    return self.get_src_vocab()

# hack override
@property
def vocab_size(self) -> int:
    """Size of the source-side vocabulary (the base class expects a single size)."""
    return self.src_vocab_size

def moses_punct_norm(self, text, lang):
    """Normalize punctuation in ``text`` using a per-language cached Moses normalizer."""
    normalizer = self.cache_moses_punct_normalizer.get(lang)
    if normalizer is None:
        normalizer = self.sm.MosesPunctNormalizer(lang=lang)
        self.cache_moses_punct_normalizer[lang] = normalizer
    return normalizer.normalize(text)

def moses_tokenize(self, text, lang):
    """Tokenize ``text`` using a per-language cached MosesTokenizer."""
    tokenizer = self.cache_moses_tokenizer.get(lang)
    if tokenizer is None:
        tokenizer = self.sm.MosesTokenizer(lang=lang)
        self.cache_moses_tokenizer[lang] = tokenizer
    return tokenizer.tokenize(
        text, aggressive_dash_splits=True, return_str=False, escape=True
    )

def moses_detokenize(self, tokens, lang):
    """Join ``tokens`` back into text using a per-language cached MosesDetokenizer."""
    detokenizer = self.cache_moses_detokenizer.get(lang)
    if detokenizer is None:
        detokenizer = self.sm.MosesDetokenizer(lang=lang)
        self.cache_moses_detokenizer[lang] = detokenizer
    return detokenizer.detokenize(tokens)

def moses_pipeline(self, text, lang):
    """Clean raw text: fix unicode punctuation, Moses-normalize, strip non-printing chars."""
    cleaned = replace_unicode_punct(text)
    normalized = self.moses_punct_norm(cleaned, lang)
    return remove_non_printing_char(normalized)

@property
def src_vocab_size(self):
    """Number of entries in the source-language vocabulary (``self.encoder``)."""
    return len(self.encoder)

@property
def tgt_vocab_size(self):
    """Number of entries in the target-language vocabulary (``self.decoder``)."""
    return len(self.decoder)

def get_src_vocab(self):
    """Return the source vocab merged with added tokens (added tokens win on conflict)."""
    return {**self.encoder, **self.added_tokens_encoder}

def get_tgt_vocab(self):
    """Return the target (id -> token) vocab merged with added tokens.

    Uses dict-unpacking (``{**a, **b}``) rather than ``dict(a, **b)``:
    ``added_tokens_decoder`` is keyed by integer ids, and ``**`` keyword
    expansion requires string keys, so the old form raised ``TypeError``.
    """
    return {**self.decoder, **self.added_tokens_decoder}

def bpe(self, token):
    """Apply byte-pair encoding to a single token.

    The token's characters are iteratively merged according to `self.bpe_ranks`
    (lowest rank first) until no known bigram remains; the last character gets
    a "</w>" end-of-word marker. Results are memoized in `self.cache`.

    Returns:
        A space-joined string of BPE sub-tokens.
    """
    # Check the cache first -- previously the `word` tuple was built before the
    # cache lookup, wasting work on every cache hit. Behavior is unchanged.
    if token in self.cache:
        return self.cache[token]
    word = tuple(token[:-1]) + (token[-1] + "</w>",)
    pairs = get_pairs(word)

    if not pairs:
        return token + "</w>"

    while True:
        # merge the adjacent pair with the lowest (best) rank
        bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
        if bigram not in self.bpe_ranks:
            break
        first, second = bigram
        new_word = []
        i = 0
        while i < len(word):
            try:
                j = word.index(first, i)
            except ValueError:
                new_word.extend(word[i:])
                break
            else:
                new_word.extend(word[i:j])
                i = j

            if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                new_word.append(first + second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        new_word = tuple(new_word)
        word = new_word
        if len(word) == 1:
            break
        else:
            pairs = get_pairs(word)
    word = " ".join(word)
    # NOTE(review): upstream XLM-style tokenizers compare against "\n  </w>"
    # (two spaces) here; this view shows one space -- confirm against the
    # original file before changing.
    if word == "\n </w>":
        word = "\n</w>"
    self.cache[token] = word
    return word

def _tokenize(self, text, lang="en", bypass_tokenizer=False):
    """
    Tokenize a string given language code using Moses.

    Details of tokenization:

    - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
    - Install with `pip install sacremoses`

    Args:
        - lang: ISO language code (default = 'en') (string). Languages should belong of the model supported
          languages. However, we don't enforce it.
        - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
          (bool). If True, we only apply BPE.

    Returns:
        List of tokens.
    """
    # ignore `lang` which is currently isn't explicitly passed in tokenization_utils.py and always results in lang=en
    # if lang != self.src_lang:
    #     raise ValueError(f"Expected lang={self.src_lang}, but got {lang}")
    lang = self.src_lang

    if self.do_lower_case:
        text = text.lower()

    if bypass_tokenizer:
        text = text.split()
    else:
        text = self.moses_pipeline(text, lang=lang)
        text = self.moses_tokenize(text, lang=lang)

    split_tokens = []
    for token in text:
        if token:
            # str.split already returns a list -- the previous list() wrapper was redundant
            split_tokens.extend(self.bpe(token).split(" "))

    return split_tokens

def _convert_token_to_id(self, token):
    """Converts a token (str) in an id using the vocab."""
    unk_id = self.encoder.get(self.unk_token)
    return self.encoder.get(token, unk_id)

def _convert_id_to_token(self, index):
    """Converts an index (integer) in a token (str) using the vocab."""
    try:
        return self.decoder[index]
    except KeyError:
        return self.unk_token

def convert_tokens_to_string(self, tokens):
    """Converts a sequence of tokens (string) in a single string."""
    # Undo BPE: drop intra-token spaces, turn "</w>" end-of-word markers into spaces.
    joined = "".join(tok.replace(" ", "").replace("</w>", " ") for tok in tokens)
    words = joined.split()
    # Detokenize with Moses on the target-language side.
    return self.moses_detokenize(words, self.tgt_lang)

def build_inputs_with_special_tokens(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
    adding special tokens. A FAIRSEQ Transformer sequence has the following format:

    - single sequence: `<s> X </s>`
    - pair of sequences: `<s> A </s> B </s>`

    Args:
        token_ids_0 (`List[int]`):
            List of IDs to which the special tokens will be added.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
    """
    sep = [self.sep_token_id]

    # no bos used in fairseq: each segment is simply terminated by </s>
    ids = token_ids_0 + sep
    if token_ids_1 is not None:
        ids = ids + token_ids_1 + sep
    return ids

def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
    special tokens using the tokenizer `prepare_for_model` method.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the token list is already formatted with special tokens for the model.

    Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    if already_has_special_tokens:
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )
    # no bos used in fairseq: only the trailing </s> of each segment is special
    mask = [0] * len(token_ids_0) + [1]
    if token_ids_1 is not None:
        mask.extend([0] * len(token_ids_1) + [1])
    return mask

def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
    Transformer sequence pair mask has the following format:

    ```
    0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
    | first sequence    | second sequence |
    ```

    If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
    """
    sep = [self.sep_token_id]

    # no bos used in fairseq: each segment contributes its tokens plus one </s>
    first_segment = [0] * (len(token_ids_0) + len(sep))
    if token_ids_1 is None:
        return first_segment
    return first_segment + [1] * (len(token_ids_1) + len(sep))

def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """Write the source vocab, target vocab and BPE merges into `save_directory`.

    Args:
        save_directory: existing directory the three files are written into.
        filename_prefix: optional prefix prepended to each file name.

    Returns:
        The paths ``(src_vocab_file, tgt_vocab_file, merges_file)``, or ``None``
        (after logging an error) if `save_directory` is not a directory.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
        return

    src_vocab_file = os.path.join(
        save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["src_vocab_file"]
    )
    tgt_vocab_file = os.path.join(
        save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["tgt_vocab_file"]
    )
    merges_file = os.path.join(
        save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
    )

    with open(src_vocab_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

    with open(tgt_vocab_file, "w", encoding="utf-8") as f:
        # self.decoder maps id -> token; invert back to token -> id for saving
        tgt_vocab = {v: k for k, v in self.decoder.items()}
        f.write(json.dumps(tgt_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

    index = 0
    with open(merges_file, "w", encoding="utf-8") as writer:
        # write merges in rank order; warn if ranks are not consecutive
        for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
            if index != token_index:
                logger.warning(
                    f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive."
                    " Please check that the tokenizer is not corrupted!"
                )
                index = token_index
            writer.write(" ".join(bpe_tokens) + "\n")
            index += 1

    return src_vocab_file, tgt_vocab_file, merges_file

def __getstate__(self):
    """Pickle support: drop the unpicklable sacremoses module handle."""
    state = dict(self.__dict__)
    state["sm"] = None
    return state

def __setstate__(self, d):
    """Unpickle support: restore state and re-import sacremoses (see __getstate__)."""
    self.__dict__ = d

    # `sm` was set to None when pickling; re-import the module here.
    try:
        import sacremoses
    except ImportError:
        raise ImportError(
            "You need to install sacremoses to use XLMTokenizer. "
            "See https://pypi.org/project/sacremoses/ for installation."
        )

    self.sm = sacremoses

# Public API of this module.
__all__ = ["FSMTTokenizer"]

+ 28
- 0
mindnlp/transformers/models/fuyu/__init__.py View File

@@ -0,0 +1,28 @@
# Copyright 2023 AdeptAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fuyu Model.
"""
from . import configuration_fuyu, image_processing_fuyu,modeling_fuyu, processing_fuyu

from .configuration_fuyu import *
from .modeling_fuyu import *
from .image_processing_fuyu import *
from .processing_fuyu import *

__all__ = []
__all__.extend(configuration_fuyu.__all__)
__all__.extend(modeling_fuyu.__all__)
__all__.extend(image_processing_fuyu.__all__)
__all__.extend(processing_fuyu.__all__)

+ 229
- 0
mindnlp/transformers/models/fuyu/configuration_fuyu.py View File

@@ -0,0 +1,229 @@
# coding=utf-8
# Copyright 2023 Adept AI and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fuyu model configuration"""

import warnings

from mindnlp.utils import logging
from ...configuration_utils import PretrainedConfig
from ..auto import CONFIG_MAPPING


logger = logging.get_logger(__name__)


class FuyuConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`FuyuForCausalLM`]. It is used to instantiate an
    Fuyu model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the
    [adept/fuyu-8b](https://huggingface.co/adept/fuyu-8b).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 262144):
            Vocabulary size of the Fuyu model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`FuyuForCausalLM`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 16384):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 36):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 16384):
            The maximum sequence length that this model might ever be used with.
        image_size (`int`, *optional*, defaults to 300):
            The input image size.
        patch_size (`int`, *optional*, defaults to 30):
            The input vision transformer encoding patch size.
        num_channels (`int`, *optional*, defaults to 3):
            The input image number of channels.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`. Whether to tie weight embeddings
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie input and output embeddings.
        rope_theta (`float`, *optional*, defaults to 25000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
            these scaling strategies behave:
            https://www.reddit.com/r/LocalFuyu/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in future versions.
        qk_layernorm (`bool`, *optional*, defaults to `True`):
            Whether or not to normalize the Queries and Keys after projecting the hidden states
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio after applying the MLP to the hidden states.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio after computing the attention scores.
        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
            Percentage of the query and keys which will have rotary embedding.

        pad_token_id (`int`, *optional*):
            The id of the *padding* token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the *beginning-of-sequence* token.
        eos_token_id (`Union[int, List[int]]`, *optional*, defaults to 2):
            The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize the language model backbone config (resolved
            through the auto config mapping; defaults to a Persimmon config).

    ```python
    >>> from transformers import FuyuConfig

    >>> # Initializing a Fuyu fuyu-7b style configuration
    >>> configuration = FuyuConfig()
    ```"""

    model_type = "fuyu"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=262144,
        hidden_size=4096,
        intermediate_size=16384,
        num_hidden_layers=36,
        num_attention_heads=64,
        hidden_act="relu2",
        max_position_embeddings=16384,
        image_size=300,
        patch_size=30,
        num_channels=3,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=25000.0,
        rope_scaling=None,
        qk_layernorm=True,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        partial_rotary_factor=0.5,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        text_config=None,
        **kwargs,
    ):
        if text_config is None:
            # mirror the top-level arguments into the language-backbone config
            text_config = {
                "vocab_size": vocab_size,
                "max_position_embeddings": max_position_embeddings,
                "hidden_size": hidden_size,
                "intermediate_size": intermediate_size,
                "num_hidden_layers": num_hidden_layers,
                "num_attention_heads": num_attention_heads,
                "hidden_act": hidden_act,
                "initializer_range": initializer_range,
                "layer_norm_eps": layer_norm_eps,
                "use_cache": use_cache,
                "rope_theta": rope_theta,
                "rope_scaling": rope_scaling,
                "qk_layernorm": qk_layernorm,
                "hidden_dropout": hidden_dropout,
                "attention_dropout": attention_dropout,
                "partial_rotary_factor": partial_rotary_factor,
                "pad_token_id": pad_token_id,
                "bos_token_id": bos_token_id,
                "eos_token_id": eos_token_id,
                "tie_word_embeddings": tie_word_embeddings,
            }
            logger.info("text_config is None. initializing the text model with default values.")
        # Persimmon is the language backbone Fuyu was built on; `.get` replaces
        # the previous membership-test-then-index idiom (same behavior).
        text_model_type = text_config.get("model_type", "persimmon")
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        # stored under a private name so the deprecated `vocab_size` property can warn
        self._vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.qk_layernorm = qk_layernorm
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.partial_rotary_factor = partial_rotary_factor
        self._rope_scaling_validation()

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")

    @property
    def vocab_size(self):
        """Deprecated accessor kept for backward compatibility; use `text_config.vocab_size`."""
        warnings.warn(
            "The `vocab_size` attribute is deprecated and will be removed in v4.44, Please use `text_config.vocab_size` instead.",
            FutureWarning,
        )
        return self._vocab_size

    @vocab_size.setter
    def vocab_size(self, value):
        self._vocab_size = value

    def to_dict(self):
        """Serialize to a dict, hiding the private `_vocab_size` backing field."""
        output = super().to_dict()
        output.pop("_vocab_size", None)
        return output

# Public API of this module.
__all__ = ["FuyuConfig"]

+ 734
- 0
mindnlp/transformers/models/fuyu/image_processing_fuyu.py View File

@@ -0,0 +1,734 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Fuyu."""

import math
from typing import Dict, List, Optional, Union

import numpy as np

from mindnlp.utils import (
TensorType,
is_mindspore_available,
logging,
)

from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import (
pad,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
is_valid_image,
make_list_of_images,
to_numpy_array,
validate_preprocess_arguments,
)


if is_mindspore_available():
import mindspore
from mindnlp.core import ops


logger = logging.get_logger(__name__)


def make_list_of_list_of_images(
    images: Union[List[List[ImageInput]], List[ImageInput], ImageInput],
) -> List[List[ImageInput]]:
    """Normalize `images` to a list of lists: single image -> [[img]], flat list -> one inner list each."""
    if is_valid_image(images):
        return [[images]]

    if isinstance(images, list):
        if all(isinstance(entry, list) for entry in images):
            return images
        return [make_list_of_images(entry) for entry in images]

    raise ValueError("images must be a list of list of images or a list of images or an image.")


class FuyuBatchFeature(BatchFeature):
    """
    BatchFeature class for Fuyu image processor and processor.

    The outputs dictionary from the processors contains a mix of tensors and lists of tensors.
    """

    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
        """
        Convert the inner content to tensors.

        Args:
            tensor_type (`str` or [`~utils.TensorType`], *optional*):
                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
                `None`, no modification is done.
        """
        if tensor_type is None:
            return self

        is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type=tensor_type)

        def _convert_tensor(elem):
            # leave values that are already tensors untouched
            if is_tensor(elem):
                return elem
            return as_tensor(elem)

        def _safe_convert_tensor(elem, key):
            # `key` is now an explicit parameter instead of a closure over the
            # loop variable below, so the error message always refers to the
            # entry actually being converted. `except Exception` (not a bare
            # `except:`) avoids swallowing KeyboardInterrupt/SystemExit.
            try:
                return _convert_tensor(elem)
            except Exception as exc:
                if key == "overflowing_values":
                    raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") from exc
                raise ValueError(
                    "Unable to create tensor, you should probably activate padding "
                    "with 'padding=True' to have batched tensors with the same length."
                ) from exc

        # Do the tensor conversion in batch
        for key, value in self.items():
            if isinstance(value, list) and isinstance(value[0], list):
                # List[List[Any]] -> List[List[Tensor]]
                self[key] = [[_safe_convert_tensor(elem, key) for elem in elems] for elems in value]
            elif isinstance(value, list):
                # List[Any] -> List[Tensor]
                self[key] = [_safe_convert_tensor(elem, key) for elem in value]
            else:
                # Any -> Tensor
                self[key] = _safe_convert_tensor(value, key)
        return self

# def to(self, *args, **kwargs) -> "BatchFeature":
# """
# Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in
# different `dtypes` and sending the `BatchFeature` to a different `device`.

# Args:
# args (`Tuple`):
# Will be passed to the `to(...)` function of the tensors.
# kwargs (`Dict`, *optional*):
# Will be passed to the `to(...)` function of the tensors.

# Returns:
# [`BatchFeature`]: The same instance after modification.
# """

# new_data = {}
# device = kwargs.get("device")
# # Check if the args are a device or a dtype
# if device is None and len(args) > 0:
# # device should be always the first argument
# arg = args[0]
# if is_torch_dtype(arg):
# # The first argument is a dtype
# pass
# elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
# device = arg
# else:
# # it's something else
# raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")

# def _to(elem):
# # check if v is a floating point
# if torch.is_floating_point(elem):
# # cast and send to device
# return elem.to(*args, **kwargs)
# if device is not None:
# return elem.to(device=device)

# return elem

# # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
# for k, v in self.items():
# if isinstance(v, list) and isinstance(v[0], list):
# # Data structure is a list of lists
# new_v = []
# for elems in v:
# new_v.append([_to(elem) for elem in elems])
# new_data[k] = new_v
# elif isinstance(v, list):
# # Data structure is a list
# new_data[k] = [_to(elem) for elem in v]
# else:
# new_data[k] = _to(v)
# self.data = new_data
# return self


class FuyuImageProcessor(BaseImageProcessor):
"""
This class should handle the image processing part before the main FuyuForCausalLM. In particular, it should
handle:

- Processing Images:
Taking a batch of images as input. If the images are variable-sized, it resizes them based on the desired patch
dimensions. The image output is always img_h, img_w of (1080, 1920)

Then, it patches up these images using the patchify_image function.

- Creating Image Input IDs:
For each patch, a placeholder ID is given to identify where these patches belong in a token sequence. For
variable-sized images, each line of patches is terminated with a newline ID.

- Image Patch Indices:
For each image patch, the code maintains an index where these patches should be inserted in a token stream.


Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image to `size`.
size (`Dict[str, int]`, *optional*, defaults to `{"height": 1080, "width": 1920}`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
do_pad (`bool`, *optional*, defaults to `True`):
Whether to pad the image to `size`.
padding_value (`float`, *optional*, defaults to 1.0):
The value to pad the image with.
padding_mode (`str`, *optional*, defaults to `"constant"`):
The padding mode to use when padding the image.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image.
image_mean (`float`, *optional*, defaults to 0.5):
The mean to use when normalizing the image.
image_std (`float`, *optional*, defaults to 0.5):
The standard deviation to use when normalizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `1 / 255`):
The factor to use when rescaling the image.
patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 30, "width": 30}`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
"""

# Output keys this processor produces for the model / FuyuProcessor.
model_input_names = [
    "images",
    "image_input_ids",
    "image_patches",
    "image_patch_indices_per_batch",
    "image_patch_indices_per_subsequence",
]

def __init__(
    self,
    do_resize: bool = True,
    size: Optional[Dict[str, int]] = None,
    resample: PILImageResampling = PILImageResampling.BILINEAR,
    do_pad: bool = True,
    padding_value: float = 1.0,
    padding_mode: str = "constant",
    do_normalize: bool = True,
    image_mean: Union[float, List[float]] = 0.5,
    image_std: Union[float, List[float]] = 0.5,
    do_rescale: bool = True,
    rescale_factor: float = 1 / 255,
    patch_size: Optional[Dict[str, int]] = None,
    **kwargs,
):
    """Store the preprocessing configuration, filling in Fuyu's default canvas and patch sizes."""
    super().__init__(**kwargs)
    self.do_resize = do_resize
    # default output canvas is 1920x1080 (width x height)
    self.size = {"height": 1080, "width": 1920} if size is None else size
    self.resample = resample
    self.do_pad = do_pad
    self.padding_value = padding_value
    self.padding_mode = padding_mode
    self.do_normalize = do_normalize
    self.image_mean = image_mean
    self.image_std = image_std
    self.do_rescale = do_rescale
    self.rescale_factor = rescale_factor
    self.patch_size = {"height": 30, "width": 30} if patch_size is None else patch_size
    # NOTE(review): presumably the set of kwargs `preprocess` accepts (for
    # argument validation) -- confirm against validate_preprocess_arguments usage.
    self._valid_processor_keys = [
        "images",
        "do_resize",
        "size",
        "resample",
        "do_pad",
        "padding_value",
        "padding_mode",
        "do_normalize",
        "image_mean",
        "image_std",
        "do_rescale",
        "rescale_factor",
        "patch_size",
        "return_tensors",
        "data_format",
        "input_data_format",
    ]

def resize(
    self,
    image: np.ndarray,
    size: Dict[str, int],
    resample: PILImageResampling = PILImageResampling.BILINEAR,
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    **kwargs,
) -> np.ndarray:
    """
    Downscale `image` so it fits inside `(size["height"], size["width"])`, preserving aspect ratio.

    Images that already fit inside the target box are returned unchanged; larger
    images are shrunk by the smaller of the two axis scale factors.

    Args:
        image (`np.ndarray`):
            Image to resize.
        size (`Dict[str, int]`):
            Dictionary in the format `{"height": int, "width": int}` specifying the maximum output size.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
            Resampling filter used when shrinking.
        data_format (`ChannelDimension` or `str`, *optional*):
            Channel dimension format of the output image; defaults to that of the input.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            Channel dimension format of the input image; inferred when unset.

    Returns:
        `np.ndarray`: The (possibly) resized image.
    """
    current_height, current_width = get_image_size(image, input_data_format)
    max_height, max_width = size["height"], size["width"]

    # already fits inside the target box -- return unchanged
    if current_width <= max_width and current_height <= max_height:
        return image

    # shrink by the tighter constraint so both dimensions fit
    scale = min(max_height / current_height, max_width / current_width)
    new_size = (int(current_height * scale), int(current_width * scale))

    return resize(
        image=image,
        size=new_size,
        resample=resample,
        data_format=data_format,
        input_data_format=input_data_format,
        **kwargs,
    )

def pad_image(
self,
image: np.ndarray,
size: Dict[str, int],
mode: str = "constant",
constant_values: float = 1.0,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Pad an image to `(size["height"], size["width"])`.

Args:
image (`np.ndarray`):
Image to pad.
size (`Dict[str, int]`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
data_format (`ChannelDimension` or `str`, *optional*):
The data format of the output image. If unset, the same format as the input image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
image_height, image_width = get_image_size(image, input_data_format)
target_height, target_width = size["height"], size["width"]
padding_top = 0
padding_left = 0
padding_bottom = target_height - image_height
padding_right = target_width - image_width
padded_image = pad(
image,
padding=((padding_top, padding_bottom), (padding_left, padding_right)),
mode=mode,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image

def preprocess(
self,
images,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
resample: Optional[PILImageResampling] = None,
do_pad: Optional[bool] = None,
padding_value: Optional[float] = None,
padding_mode: Optional[str] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[float] = None,
image_std: Optional[float] = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
patch_size: Optional[Dict[str, int]] = None,
data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
return_tensors: Optional[TensorType] = None,
):
"""

Utility function to preprocess the images and extract necessary information about original formats.

Args:
images (`ImageInput`):
Images to preprocess. Expects a single image, a list or images or a list of lists of images. Pixel
values range from 0 to 255, or between 0 and 1 if `do_rescale` is `False`.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image to `size`.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
`PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
do_pad (`bool`, *optional*, defaults to `self.do_pad`):
Whether to pad the image to `size`.
padding_value (`float`, *optional*, defaults to `self.padding_value`):
The value to pad the image with.
padding_mode (`str`, *optional*, defaults to `self.padding_mode`):
The padding mode to use when padding the image.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float`, *optional*, defaults to `self.image_mean`):
The mean to use when normalizing the image.
image_std (`float`, *optional*, defaults to `self.image_std`):
The standard deviation to use when normalizing the image.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
The factor to use when rescaling the image.
patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format of the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""

do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
resample = resample if resample is not None else self.resample
do_pad = do_pad if do_pad is not None else self.do_pad
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
padding_value = padding_value if padding_value is not None else self.padding_value
padding_mode = padding_mode if padding_mode is not None else self.padding_mode
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
patch_size = patch_size if patch_size is not None else self.patch_size

if isinstance(images, list) and any(isinstance(elem, list) and len(elem) >= 2 for elem in images):
raise ValueError("Multiple images for a single sample are not yet supported.")

batch_images = make_list_of_list_of_images(images)

validate_preprocess_arguments(
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_pad=do_pad,
size_divisibility=size, # There is no pad divisibility in this processor, but pad requires the size arg.
do_resize=do_resize,
size=size,
resample=resample,
)
# All transformations expect numpy arrays.
batch_images = [[to_numpy_array(image) for image in images] for images in batch_images]

if is_scaled_image(batch_images[0][0]) and do_rescale:
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)

if input_data_format is None:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(batch_images[0][0])

original_image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]

if do_resize:
batch_images = [
[self.resize(image, size=size, input_data_format=input_data_format) for image in images]
for images in batch_images
]

image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]

# scale_h is the same as scale_w
image_scale_factors = [
[resized_size[0] / original_size[0]]
for original_size, resized_size in zip(original_image_sizes, image_sizes)
]

if do_pad:
batch_images = [
[
self.pad_image(
image,
size=size,
mode=padding_mode,
constant_values=padding_value,
input_data_format=input_data_format,
)
for image in images
]
for images in batch_images
]

if do_rescale:
batch_images = [
[self.rescale(image, scale=rescale_factor, input_data_format=input_data_format) for image in images]
for images in batch_images
]

if do_normalize:
batch_images = [
[
self.normalize(image, mean=image_mean, std=image_std, input_data_format=input_data_format)
for image in images
]
for images in batch_images
]

if data_format is not None:
batch_images = [
[to_channel_dimension_format(image, data_format, input_data_format) for image in images]
for images in batch_images
]

data = {
"images": batch_images,
"image_unpadded_heights": image_unpadded_heights,
"image_unpadded_widths": image_unpadded_widths,
"image_scale_factors": image_scale_factors,
}
return FuyuBatchFeature(data=data, tensor_type=return_tensors)

def get_num_patches(self, image_height: int, image_width: int, patch_size: Dict[str, int] = None) -> int:
"""
Calculate number of patches required to encode an image.

Args:
image_height (`int`):
Height of the image.
image_width (`int`):
Width of the image.
patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
"""
patch_size = patch_size if patch_size is not None else self.patch_size
patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]

if image_height % patch_height != 0:
raise ValueError(f"{image_height=} must be divisible by {patch_height}")
if image_width % patch_width != 0:
raise ValueError(f"{image_width=} must be divisible by {patch_width}")

num_patches_per_dim_h = image_height // patch_height
num_patches_per_dim_w = image_width // patch_width
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
return num_patches

    def patchify_image(self, image: "mindspore.Tensor", patch_size: Optional[Dict[str, int]] = None) -> "mindspore.Tensor":
        """
        Convert an image into a tensor of flattened, non-overlapping patches.

        Args:
            image (`mindspore.Tensor`):
                Image to convert. Shape: [batch, channels, height, width]
            patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.

        Returns:
            `mindspore.Tensor`: Tensor of shape [batch, num_patches, channels * patch_height * patch_width].
        """
        # requires_backends(self, ["torch"])
        patch_size = patch_size if patch_size is not None else self.patch_size
        patch_height, patch_width = patch_size["height"], patch_size["width"]

        # TODO refer to https://github.com/ArthurZucker/transformers/blob/0f0a3fe5ca5697ee58faeb5b53f049af720b5e98/src/transformers/models/vit_mae/modeling_vit_mae.py#L871
        # torch implementation is faster but does not handle non-squares

        batch_size, channels, _, _ = image.shape
        # Slide a window of size patch_height (step patch_height, i.e. non-overlapping)
        # along the height axis (dim 2), then patch_width along the width axis (dim 3).
        # NOTE(review): the window size is wrapped in mindspore.Tensor while the step is a
        # plain int — confirm this matches the ops.unfold signature of the pinned mindnlp version.
        unfolded_along_height = ops.unfold(image, 2, mindspore.Tensor(patch_height), patch_height)
        patches = ops.unfold(unfolded_along_height, 3, mindspore.Tensor(patch_width), patch_width)
        # Collect the unfolded slices into [batch, channels, num_patches, patch_height, patch_width].
        patches = patches.view(batch_size, channels, -1, patch_height, patch_width)
        # Move channels last so each flattened patch is laid out as (h, w, c) values.
        patches = patches.permute(0, 2, 3, 4, 1)
        patches = patches.reshape(batch_size, -1, channels * patch_height * patch_width)
        return patches

    def preprocess_with_tokenizer_info(
        self,
        image_input: "mindspore.Tensor",
        image_present: "mindspore.Tensor",
        image_unpadded_h: "mindspore.Tensor",
        image_unpadded_w: "mindspore.Tensor",
        image_placeholder_id: int,
        image_newline_id: int,
        variable_sized: bool,
        patch_size: Optional[Dict[str, int]] = None,
    ) -> FuyuBatchFeature:
        """Process images for model input. In particular, variable-sized images are handled here.

        Args:
            image_input (`mindspore.Tensor` of shape [batch_size, subsequence_size, num_channels, height, width]):
                Tensor of images padded to model input size.
            image_present (`mindspore.Tensor` of shape [batch_size, subsequence_size, num_images]):
                Tensor of 1s and 0s indicating whether an image is present.
            image_unpadded_h (`mindspore.Tensor` of shape [batch_size, subsequence_size]):
                Tensor of unpadded image heights.
            image_unpadded_w (`mindspore.Tensor` of shape [batch_size, subsequence_size]):
                Tensor of unpadded image widths.
            image_placeholder_id (int):
                The id of the image placeholder token. Comes from an associated tokenizer.
            image_newline_id (int):
                The id of the image newline token. Comes from an associated tokenizer.
            variable_sized (bool):
                Whether to process images as variable-sized.
            patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Size of the patches.

        Returns:
            `FuyuBatchFeature` with the cropped images, per-subsequence image token id streams,
            flattened patches, and patch-index streams (batch-wide and per-subsequence).
        """
        # requires_backends(self, ["torch"])

        patch_size = patch_size if patch_size is not None else self.patch_size
        patch_height, patch_width = patch_size["height"], patch_size["width"]

        # Only images that are present.
        images: List[List[mindspore.Tensor]] = []
        batch_image_patches: List[List[mindspore.Tensor]] = []
        # Image input ids for every subsequence, including ones with no image present.
        batch_image_input_ids: List[List[mindspore.Tensor]] = []
        for batch_index in range(image_input.shape[0]):
            image_input_ids = []
            image_patches = []
            for subseq_index in range(image_input.shape[1]):
                if image_present[batch_index, subseq_index]:
                    image = image_input[batch_index, subseq_index]
                    image_height, image_width = image.shape[1], image.shape[2]
                    if variable_sized:
                        # Crop the padded canvas down to the unpadded content, rounded
                        # up to whole patches so patchification still tiles exactly.
                        # The min() is required here due to floating point issues:
                        # math.ceil(torch.tensor(300).cuda() / 30) == 11
                        new_h = min(
                            image_height,
                            math.ceil(image_unpadded_h[batch_index, subseq_index].to(mindspore.float32) / patch_height) * patch_height,
                        )
                        new_w = min(
                            image_width,
                            math.ceil(image_unpadded_w[batch_index, subseq_index].to(mindspore.float32) / patch_width) * patch_width,
                        )
                        image = image[:, :new_h, :new_w]
                        image_height, image_width = new_h, new_w

                    # One placeholder token id per patch; the patch content itself is
                    # carried separately in `image_patches`.
                    num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
                    tensor_of_image_ids = ops.full(
                        [num_patches], image_placeholder_id, dtype=mindspore.int32
                    )
                    patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
                    # Sanity check: patchify_image must agree with get_num_patches.
                    assert num_patches == patches.shape[0]

                    if variable_sized:
                        # Now terminate each line with |NEWLINE|.
                        tensor_of_image_ids = tensor_of_image_ids.reshape(-1, image_width // patch_width)
                        newline_ids = ops.full(
                            [tensor_of_image_ids.shape[0], 1],
                            image_newline_id,
                            dtype=mindspore.int32,
                        )
                        tensor_of_image_ids = ops.cat([tensor_of_image_ids, newline_ids], dim=1)
                        tensor_of_image_ids = tensor_of_image_ids.reshape(-1)

                    images.append([image])
                    image_input_ids.append(tensor_of_image_ids)
                    image_patches.append(patches)
                else:
                    # No image in this subsequence: contribute an empty id stream.
                    image_input_ids.append(mindspore.tensor([], dtype=mindspore.int32))

            batch_image_input_ids.append(image_input_ids)
            batch_image_patches.append(image_patches)

        # Create image_patch_input_indices, where non-negative values correspond to image patches to be inserted in
        # the stream.
        image_patch_indices_per_batch: List[List[mindspore.Tensor]] = []
        image_patch_indices_per_subsequence: List[List[mindspore.Tensor]] = []

        for sample_image_input_ids in batch_image_input_ids:
            # Offset accumulates across subsequences so batch-wide indices are unique.
            index_offset = 0
            per_batch_indices = []
            per_subsequence_indices = []
            for subseq_image_input_ids in sample_image_input_ids:
                # Indices of image patches.
                patches_mask = subseq_image_input_ids == image_placeholder_id
                num_patches = ops.count_nonzero(patches_mask.to(mindspore.int64),dim=())
                indices = ops.arange(num_patches, dtype=mindspore.int64).type_as(
                    subseq_image_input_ids
                )

                # Place those indices in the image input ids token stream, with -1 representing non-index tokens.
                indices_in_stream_per_batch = ops.full_like(subseq_image_input_ids, -1)
                indices_in_stream_per_subsequence = ops.full_like(subseq_image_input_ids, -1)
                # Positions of the placeholder tokens within the stream (|NEWLINE| tokens stay -1).
                patches_inds = ops.nonzero(patches_mask.to(mindspore.int64), as_tuple=True)[0]
                # patches_inds = mindspore.ops.nonzero(patches_mask.to(mindspore.int64)).reshape(-1)

                indices_in_stream_per_batch[patches_inds] = indices + index_offset
                indices_in_stream_per_subsequence[patches_inds] = indices

                per_batch_indices.append(indices_in_stream_per_batch)
                per_subsequence_indices.append(indices_in_stream_per_subsequence)
                index_offset += num_patches

            image_patch_indices_per_batch.append(per_batch_indices)
            image_patch_indices_per_subsequence.append(per_subsequence_indices)

        return FuyuBatchFeature(
            data={
                "images": images,
                "image_input_ids": batch_image_input_ids,
                "image_patches": batch_image_patches,
                "image_patch_indices_per_batch": image_patch_indices_per_batch,
                "image_patch_indices_per_subsequence": image_patch_indices_per_subsequence,
            }
        )

__all__ = ["FuyuImageProcessor"]

+ 295
- 0
mindnlp/transformers/models/fuyu/modeling_fuyu.py View File

@@ -0,0 +1,295 @@
# coding=utf-8
# Copyright 2023 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mindspore Fuyu model."""

from typing import List, Optional, Tuple, Union

import mindspore

from mindnlp.core import nn, ops
from mindnlp.utils import logging

from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...models.auto.modeling_auto import AutoModelForCausalLM

from .configuration_fuyu import FuyuConfig

logger = logging.get_logger(__name__)

class FuyuPreTrainedModel(PreTrainedModel):
    """Base class for Fuyu models: hooks up the config class and weight initialization."""

    config_class = FuyuConfig
    base_model_prefix = "fuyu"
    supports_gradient_checkpointing = True
    _no_split_modules = []
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        """Initialize `module`'s weights from N(0, initializer_range); zero linear biases
        and the embedding row at `padding_idx`."""
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx] = 0

class FuyuForCausalLM(FuyuPreTrainedModel):
    """
    Fuyu causal language model with image-patch conditioning.

    Image patches are linearly projected into the text embedding space by
    `vision_embed_tokens` and written into the word-embedding stream at the positions
    given by `image_patches_indices`; the combined sequence is then run through the
    wrapped causal language model.
    """

    def __init__(self, config: FuyuConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.text_config.vocab_size
        self.language_model = AutoModelForCausalLM.from_config(
            config.text_config
        )

        # Projects one flattened patch (patch_size * patch_size * num_channels values)
        # into the language model's hidden space.
        self.vision_embed_tokens = nn.Linear(
            config.patch_size * config.patch_size * config.num_channels, config.hidden_size
        )

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    # The following accessors all delegate to the wrapped language model.
    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def tie_weights(self):
        return self.language_model.tie_weights()

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
        """Resize the language model's token embeddings and keep all cached vocab sizes in sync."""
        # TODO: config.vocab_size is deprecated and will be removed in v4.43.
        # `resize_token_embeddings` should work from `modeling_utils.py``
        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        self.config.text_config.vocab_size = model_embeds.num_embeddings
        self.config.vocab_size = model_embeds.num_embeddings
        self.vocab_size = model_embeds.num_embeddings
        return model_embeds

    def gather_continuous_embeddings(
        self,
        word_embeddings: mindspore.Tensor,
        continuous_embeddings: List[mindspore.Tensor],
        image_patch_input_indices: mindspore.Tensor,
    ) -> mindspore.Tensor:
        """This function places the continuous_embeddings into the word_embeddings at the locations
        indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`List[mindspore.Tensor]`):
                Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
                [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
                indices in image_patch_input_indices for that batch element.
            image_patch_input_indices (`mindspore.Tensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.

        Raises:
            ValueError: If the batch sizes disagree, or a batch element has more patch
                indices than continuous embeddings.
        """
        if not (word_embeddings.shape[0] == len(continuous_embeddings)):
            raise ValueError(
                f"Batch sizes must match! Got {len(continuous_embeddings)=} and {word_embeddings.shape[0]=}"
            )
        output_embeddings = word_embeddings.copy()
        for batch_idx in range(word_embeddings.shape[0]):
            # First, find the positions of all the non-negative values in image_patch_input_indices, those are the
            # positions in word_embeddings that we want to replace with content from continuous_embeddings.
            dst_indices = ops.nonzero(image_patch_input_indices[batch_idx] >= 0, as_tuple=True)[0]
            # Next look up those indices in image_patch_input_indices to find the indices in continuous_embeddings that we
            # want to use to replace the values in word_embeddings.
            src_indices = image_patch_input_indices[batch_idx][dst_indices]
            # Check if we have more indices than embeddings. Note that we could have fewer indices if images got truncated.
            if src_indices.shape[0] > continuous_embeddings[batch_idx].shape[0]:
                raise ValueError(
                    f"Number of continuous embeddings {continuous_embeddings[batch_idx].shape=} does not match "
                    f"number of continuous token ids {src_indices.shape=} in batch element {batch_idx}."
                )
            output_embeddings[batch_idx, dst_indices] = continuous_embeddings[batch_idx][src_indices]
        return output_embeddings

    def forward(
        self,
        input_ids: mindspore.Tensor = None,
        image_patches: mindspore.Tensor = None,  # [batch_size, num_total_patches, patch_size_ x patch_size x num_channels ]
        image_patches_indices: mindspore.Tensor = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[List[mindspore.Tensor]] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        labels: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import requests

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            # Fixed typo in the error message ("input_is" -> "input_ids").
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        seq_length_with_past = seq_length
        past_key_values_length = 0

        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            position_ids = ops.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=mindspore.int64
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
            # Image patches are only merged on the first forward pass; with a cache
            # (past_key_values) the image tokens were already consumed.
            if image_patches is not None and past_key_values is None:
                patch_embeddings = [
                    self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
                    .squeeze(0)
                    for patch in image_patches
                ]
                inputs_embeds = self.gather_continuous_embeddings(
                    word_embeddings=inputs_embeds,
                    continuous_embeddings=patch_embeddings,
                    image_patch_input_indices=image_patches_indices,
                )

        outputs = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            labels=labels,
            use_cache=use_cache,
            return_dict=return_dict,
        )

        return outputs

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        image_patches=None,
        image_patches_indices=None,
        **kwargs,
    ):
        """Assemble the kwargs for one generation step; image inputs are only
        forwarded on the first step (no cache yet)."""
        if past_key_values:
            # With a cache, only the newest token needs to be processed.
            input_ids = input_ids[:, -1:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids = position_ids.masked_fill(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        # NOTE: a previous unconditional pre-assignment of "image_patches_indices" was
        # removed — the update below always sets the key and overwrote it anyway.
        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
                "image_patches_indices": image_patches_indices if past_key_values is None else None,
                "image_patches": image_patches if past_key_values is None else None,
            }
        )
        return model_inputs

__all__ = [
"FuyuForCausalLM",
"FuyuPreTrainedModel",
]

+ 700
- 0
mindnlp/transformers/models/fuyu/processing_fuyu.py View File

@@ -0,0 +1,700 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Image/Text processor class for Fuyu
"""

import re
from typing import Dict, List, Optional, Tuple, Union

import numpy as np

from mindnlp.utils import TensorType, is_mindspore_available, logging

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, TruncationStrategy


if is_mindspore_available():
from .image_processing_fuyu import FuyuBatchFeature


logger = logging.get_logger(__name__)


if is_mindspore_available():
import mindspore
from mindnlp.core import ops


TEXT_REPR_BBOX_OPEN = "<box>"
TEXT_REPR_BBOX_CLOSE = "</box>"
TEXT_REPR_POINT_OPEN = "<point>"
TEXT_REPR_POINT_CLOSE = "</point>"

TOKEN_BBOX_OPEN_STRING = "<0x00>" # <bbox>
TOKEN_BBOX_CLOSE_STRING = "<0x01>" # </bbox>
TOKEN_POINT_OPEN_STRING = "<0x02>" # <point>
TOKEN_POINT_CLOSE_STRING = "<0x03>" # </point>
BEGINNING_OF_ANSWER_STRING = "<0x04>" # <boa>


def full_unpacked_stream_to_tensor(
    all_bi_tokens_to_place: List[int],
    full_unpacked_stream: List["mindspore.Tensor"],
    fill_value: int,
    batch_size: int,
    new_seq_len: int,
    offset: int,
) -> "mindspore.Tensor":
    """Takes an unpacked stream of tokens (i.e. a list of tensors, one for each item in the batch) and does
    the required padding to create a single tensor for the batch of shape batch_size x new_seq_len.

    Args:
        all_bi_tokens_to_place (`List[int]`):
            Number of tokens to copy for each batch entry.
        full_unpacked_stream (`List[mindspore.Tensor]`):
            One 1-D token tensor per batch entry.
        fill_value (`int`):
            Value used to pad positions past each entry's token count.
        batch_size (`int`):
            Expected number of entries in both input lists.
        new_seq_len (`int`):
            Sequence length of the padded output tensor.
        offset (`int`):
            Start index within each unpacked stream from which tokens are copied.

    Returns:
        `mindspore.Tensor`: Tensor of shape [batch_size, new_seq_len].

    Raises:
        ValueError: If either input list does not contain exactly `batch_size` entries.
    """
    # Validate with explicit exceptions rather than `assert`, which is stripped
    # when running under `python -O`.
    if len(all_bi_tokens_to_place) != batch_size:
        raise ValueError(
            f"Expected {batch_size} token counts, got {len(all_bi_tokens_to_place)}."
        )
    if len(full_unpacked_stream) != batch_size:
        raise ValueError(
            f"Expected {batch_size} unpacked streams, got {len(full_unpacked_stream)}."
        )

    # Create padded tensors for the full batch.
    new_padded_tensor = ops.full(
        [batch_size, new_seq_len],
        fill_value=fill_value,
        dtype=full_unpacked_stream[0].dtype,
    )

    # Place each batch entry into the batch tensor.
    for bi in range(batch_size):
        tokens_to_place = all_bi_tokens_to_place[bi]
        new_padded_tensor[bi, :tokens_to_place] = full_unpacked_stream[bi][offset : tokens_to_place + offset]

    return new_padded_tensor


def construct_full_unpacked_stream(
    num_real_text_tokens: Union[List[List[int]], "mindspore.Tensor"],
    input_stream: "mindspore.Tensor",
    image_tokens: List[List["mindspore.Tensor"]],
    batch_size: int,
    num_sub_sequences: int,
) -> List["mindspore.Tensor"]:
    """Takes an input_stream tensor of shape B x S x ?. For each subsequence, adds any required
    padding to account for images and then unpacks the subsequences to create a single sequence per item in the batch.
    Returns a list of tensors, one for each item in the batch."""

    unpacked_streams = []

    # TODO Remove this logic in a subsequent release since subsequences are not supported:
    # only subsequence 0 of each sample is ever consulted below.
    for sample_idx in range(batch_size):
        # Prepend the image placeholder/newline tokens to the text tokens of subsequence 0.
        image_prefix = image_tokens[sample_idx][0].to(input_stream.dtype)
        combined_stream = ops.cat([image_prefix, input_stream[sample_idx, 0]], dim=0)
        # Truncate away padding: keep the image tokens plus the real text tokens only.
        real_length = image_prefix.shape[0] + num_real_text_tokens[sample_idx][0]
        unpacked_streams.append(combined_stream[:real_length])

    return unpacked_streams


def _replace_string_repr_with_token_tags(prompt: str) -> str:
    """Swap the human-readable bbox/point tags in `prompt` for their tokenizer tag strings."""
    # Order matters only for readability here; the four tag pairs never overlap.
    tag_map = (
        (TEXT_REPR_POINT_OPEN, TOKEN_POINT_OPEN_STRING),
        (TEXT_REPR_POINT_CLOSE, TOKEN_POINT_CLOSE_STRING),
        (TEXT_REPR_BBOX_OPEN, TOKEN_BBOX_OPEN_STRING),
        (TEXT_REPR_BBOX_CLOSE, TOKEN_BBOX_CLOSE_STRING),
    )
    for text_repr, token_tag in tag_map:
        prompt = prompt.replace(text_repr, token_tag)
    return prompt


def _segment_prompt_into_text_token_conversions(prompt: str) -> List:
    """
    Given a string prompt, converts the prompt into a list of TextTokenConversions.

    Each entry is a `(text, is_coordinate)` tuple, where `is_coordinate` marks text
    that directly followed a bbox/point *open* token.
    """
    # Split the prompt wherever one of the four bbox/point tag tokens occurs; the
    # capturing group keeps the delimiters in the split result.
    tag_tokens = (
        TOKEN_BBOX_OPEN_STRING,
        TOKEN_BBOX_CLOSE_STRING,
        TOKEN_POINT_OPEN_STRING,
        TOKEN_POINT_CLOSE_STRING,
    )
    splitter = re.compile("(" + "|".join(tag_tokens) + ")")
    segments = splitter.split(prompt)

    conversions: List = []
    for idx, segment in enumerate(segments):
        # Skip the tag delimiters themselves and empty fragments produced by the split.
        if not segment or segment in tag_tokens:
            continue
        # A fragment holds coordinates when it directly follows an open tag
        # (idx > 1 guarantees a preceding delimiter exists to inspect).
        follows_open_tag = idx > 1 and segments[idx - 1] in (TOKEN_BBOX_OPEN_STRING, TOKEN_POINT_OPEN_STRING)
        conversions.append((segment, follows_open_tag))
    return conversions


def _transform_coordinates_and_tokenize(prompt: str, scale_factor: float, tokenizer) -> List[int]:
    """
    This function transforms the prompt in the following fashion:
    - <box> <point> and </box> </point> to their respective token mappings
    - extract the coordinates from the tag
    - transform the coordinates into the transformed image space
    - return the prompt tokens with the transformed coordinates and new tags

    Bounding boxes and points MUST be in the following format: <box>y1, x1, y2, x2</box> <point>x, y</point> The spaces
    and punctuation added above are NOT optional.
    """
    # Step 1: replace the human-readable point/box tags with their token-tag forms.
    tagged_prompt = _replace_string_repr_with_token_tags(prompt)
    # Step 2: split into (text, is_coordinate) segments.
    segments = _segment_prompt_into_text_token_conversions(tagged_prompt)
    token_ids: List[int] = []
    for segment_text, is_coordinate in segments:
        if is_coordinate:
            # Coordinate text is rescaled and tokenized digit-group by digit-group,
            # wrapped in the matching open/close tokens.
            token_ids.extend(_transform_within_tags(segment_text, scale_factor, tokenizer))
        else:
            # Plain text is tokenized as-is, without special tokens.
            token_ids.extend(tokenizer(segment_text, add_special_tokens=False).input_ids)
    return token_ids


def _transform_within_tags(text: str, scale_factor: float, tokenizer) -> List[int]:
    """
    Given a bounding box of the fashion <box>1, 2, 3, 4</box> | <point>1, 2</point> This function is responsible for
    converting 1, 2, 3, 4 into tokens of 1 2 3 4 without any commas.
    """
    coordinate_strs = text.split(",")
    # Two comma-separated numbers -> a point; otherwise treat it as a bounding box.
    if len(coordinate_strs) == 2:
        open_token = tokenizer.vocab[TOKEN_POINT_OPEN_STRING]
        close_token = tokenizer.vocab[TOKEN_POINT_CLOSE_STRING]
    else:
        open_token = tokenizer.vocab[TOKEN_BBOX_OPEN_STRING]
        close_token = tokenizer.vocab[TOKEN_BBOX_CLOSE_STRING]

    # Strip surrounding whitespace and parse each coordinate as a float.
    coordinates = [float(part.strip()) for part in coordinate_strs]
    # Scale the coordinates into the transformed image space.
    if len(coordinates) == 2:
        scaled = scale_point_to_transformed_image(x=coordinates[0], y=coordinates[1], scale_factor=scale_factor)
    elif len(coordinates) == 4:
        scaled = scale_bbox_to_transformed_image(
            top=coordinates[0],
            left=coordinates[1],
            bottom=coordinates[2],
            right=coordinates[3],
            scale_factor=scale_factor,
        )
    else:
        raise ValueError(f"Invalid number of ints: {len(coordinates)}")
    # Each scaled integer maps to exactly one vocabulary entry.
    coordinate_tokens = [tokenizer.vocab[str(num)] for num in scaled]
    return [open_token] + coordinate_tokens + [close_token]


def _tokenize_prompts_with_image_and_batch(
    tokenizer,
    prompts: List[List[str]],
    scale_factors: Optional[List[List["mindspore.Tensor"]]],
    max_tokens_to_generate: int,
    max_position_embeddings: int,
    add_BOS: bool,  # Same issue with types as above
    add_beginning_of_answer_token: bool,
) -> Tuple["mindspore.Tensor", "mindspore.Tensor"]:
    """
    Given a set of prompts and number of tokens to generate:
    - tokenize prompts
    - set the sequence length to be the max of length of prompts plus the number of tokens we would like to generate
    - pad all the sequences to this length so we can convert them into a 3D tensor.

    Args:
        tokenizer: Tokenizer exposing ``tokenize``/``__call__`` and a ``vocab`` mapping.
        prompts: One list of prompt strings per batch item.
        scale_factors: Per-prompt image scale factors. When given, coordinate tags inside the
            prompts are rescaled while tokenizing; when ``None``, prompts are tokenized as-is.
        max_tokens_to_generate: Number of generation slots added after the longest prompt.
        max_position_embeddings: Hard cap on the padded sequence length.
        add_BOS: If True, prepend the ``<s>`` token; otherwise prepend ``|ENDOFTEXT|``.
        add_beginning_of_answer_token: If True, append the beginning-of-answer token to the
            last subsequence of each batch item.

    Returns:
        Tuple of (padded prompt-token tensor, per-subsequence prompt-length tensor), both int64.

    Raises:
        ValueError: If any single prompt exceeds the padded sequence length.
    """
    # If not tool use, transform the coordinates while tokenizing
    if scale_factors is not None:
        transformed_prompt_tokens = []
        for prompt_seq, scale_factor_seq in zip(prompts, scale_factors):
            transformed_prompt_tokens.append(
                [
                    _transform_coordinates_and_tokenize(prompt, scale_factor.item(), tokenizer)
                    for prompt, scale_factor in zip(prompt_seq, scale_factor_seq)
                ]
            )
    else:
        transformed_prompt_tokens = [[tokenizer.tokenize(prompt) for prompt in prompt_seq] for prompt_seq in prompts]

    prompts_tokens = transformed_prompt_tokens

    if add_BOS:
        bos_token = tokenizer.vocab["<s>"]
    else:
        bos_token = tokenizer.vocab["|ENDOFTEXT|"]
    prompts_tokens = [[[bos_token] + x for x in prompt_seq] for prompt_seq in prompts_tokens]
    if add_beginning_of_answer_token:
        boa = tokenizer.vocab[BEGINNING_OF_ANSWER_STRING]
        # Only add bbox open token to the last subsequence since that is what will be completed
        for token_seq in prompts_tokens:
            token_seq[-1].append(boa)

    # Now we have a list of list of tokens which each list has a different
    # size. We want to extend this list to:
    # - incorporate the tokens that need to be generated
    # - make all the sequences equal length.
    # Get the prompts length.

    prompts_length = [[len(x) for x in prompts_tokens_seq] for prompts_tokens_seq in prompts_tokens]
    # Get the max prompts length.
    max_prompt_len: int = np.max(prompts_length)
    # Number of tokens in each sample of the batch.
    samples_length = min(max_prompt_len + max_tokens_to_generate, max_position_embeddings)
    if max_prompt_len + max_tokens_to_generate > max_position_embeddings:
        # BUGFIX: the warning was previously passed two positional f-strings; logging
        # treated the second as a %-format argument, so it was never rendered. Emit one string.
        logger.warning(
            f"Max subsequence prompt length of {max_prompt_len} + max tokens to generate {max_tokens_to_generate} "
            f"exceeds context length of {max_position_embeddings}. Will generate as many tokens as possible."
        )
    # Now update the list of list to be of the same size: samples_length.
    for prompt_tokens_seq, prompts_length_seq in zip(prompts_tokens, prompts_length):
        for prompt_tokens, prompt_length in zip(prompt_tokens_seq, prompts_length_seq):
            if len(prompt_tokens) > samples_length:
                raise ValueError("Length of subsequence prompt exceeds sequence length.")
            # Right-pad with |ENDOFTEXT| up to the common sample length.
            padding_size = samples_length - prompt_length
            prompt_tokens.extend([tokenizer.vocab["|ENDOFTEXT|"]] * padding_size)

    # Now we are in a structured format, we can convert to tensors.
    prompts_tokens_tensor = mindspore.tensor(prompts_tokens, dtype=mindspore.int64)
    prompts_length_tensor = mindspore.tensor(prompts_length, dtype=mindspore.int64)

    return prompts_tokens_tensor, prompts_length_tensor


# Simplified assuming self.crop_top = self.padding_top = 0
def original_to_transformed_h_coords(original_coords, scale_h):
    """Map original-image vertical coordinates into the scaled image, rounded to int32."""
    scaled = np.round(original_coords * scale_h)
    return scaled.astype(np.int32)


# Simplified assuming self.crop_left = self.padding_left = 0
def original_to_transformed_w_coords(original_coords, scale_w):
    """Map original-image horizontal coordinates into the scaled image, rounded to int32."""
    scaled = np.round(original_coords * scale_w)
    return scaled.astype(np.int32)


def scale_point_to_transformed_image(x: float, y: float, scale_factor: float) -> List[int]:
    """Scale an (x, y) point into the transformed image space.

    Inputs are halved before scaling — presumably because model coordinates are emitted
    at twice the image resolution; confirm against the post-processing's `2 *` rescale.
    """
    half_x = x / 2
    half_y = y / 2
    x_scaled = original_to_transformed_w_coords(np.array([half_x]), scale_factor)[0]
    y_scaled = original_to_transformed_h_coords(np.array([half_y]), scale_factor)[0]
    return [x_scaled, y_scaled]


def scale_bbox_to_transformed_image(
    top: float, left: float, bottom: float, right: float, scale_factor: float
) -> List[int]:
    """Scale a (top, left, bottom, right) box into the transformed image space.

    Inputs are halved before scaling, mirroring `scale_point_to_transformed_image`.

    NOTE(review): the vertical values (top/bottom) go through the *w* helper and the
    horizontal ones (left/right) through the *h* helper. The two helpers are currently
    identical so this is inconsequential, but the pairing looks swapped — confirm
    before the helpers ever diverge.
    """
    halved = (top / 2, left / 2, bottom / 2, right / 2)
    transforms = (
        original_to_transformed_w_coords,
        original_to_transformed_h_coords,
        original_to_transformed_w_coords,
        original_to_transformed_h_coords,
    )
    return [fn(np.array([value]), scale_factor)[0] for fn, value in zip(transforms, halved)]


class FuyuProcessor(ProcessorMixin):
    r"""
    Constructs a Fuyu processor which wraps a Fuyu image processor and a Llama tokenizer into a single processor.

    [`FuyuProcessor`] offers all the functionalities of [`FuyuImageProcessor`] and [`LlamaTokenizerFast`]. See the
    [`~FuyuProcessor.__call__`] and [`~FuyuProcessor.decode`] for more information.

    Args:
        image_processor ([`FuyuImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerFast`]):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = []
    image_processor_class = "FuyuImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer, **kwargs):
        """Store the two sub-processors and the generation/padding constants used by `__call__`."""
        super().__init__(image_processor=image_processor, tokenizer=tokenizer)
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        # Budget of tokens the model may generate on top of the prompt; added to the prompt
        # length when sizing the padded sequences.
        self.max_tokens_to_generate = 10
        self.max_position_embeddings = 16384  # TODO Can't derive this from model files: where to set it?
        # Token id used to left-pad `input_ids`.
        self.pad_token_id = 0
        # Fill value for padded positions of `image_patches_indices` (i.e. "no patch here").
        self.dummy_image_index = -1

    def _left_pad_inputs_with_attention_mask(self, model_inputs: List[Dict], return_attention_mask: bool):
        """Left-pad each sample's `input_ids` and `image_patches_indices` to the batch maximum
        and concatenate them into batch tensors; build a matching attention mask (0 on padding,
        1 on real tokens). `image_patches` are collected as a plain list since their shapes vary.
        """
        max_length_input_ids = max(entry["input_ids"].shape[1] for entry in model_inputs)
        max_length_image_patch_indices = max(entry["image_patches_indices"].shape[1] for entry in model_inputs)

        batched_inputs = {"input_ids": [], "image_patches": [], "image_patches_indices": [], "attention_mask": []}

        for entry in model_inputs:
            for key, tensor in entry.items():
                if key == "input_ids":
                    # Left-pad with `pad_token_id` up to the longest sequence in the batch.
                    num_padding_tokens = max_length_input_ids - tensor.shape[1]
                    padded_input_ids = ops.cat(
                        [
                            ops.full((tensor.shape[0], num_padding_tokens), self.pad_token_id, dtype=mindspore.int64),
                            tensor,
                        ],
                        dim=1,
                    )
                    batched_inputs[key].append(padded_input_ids)

                    # Mask out the left padding (zeros) and attend to the real tokens (ones).
                    attention_mask = ops.cat(
                        [ops.zeros((tensor.shape[0], num_padding_tokens), dtype=mindspore.int64), ops.ones_like(tensor)],
                        dim = 1,
                    )
                    batched_inputs["attention_mask"].append(attention_mask)

                elif key == "image_patches":
                    # For image_patches, we don't pad but just append them to the list.
                    batched_inputs[key].append(tensor)

                else:  # for image_patches_indices
                    # Left-pad with `dummy_image_index` so padded slots map to no image patch.
                    num_padding_indices = max_length_image_patch_indices - tensor.shape[1]
                    padded_indices = ops.cat(
                        [
                            ops.full(
                                (tensor.shape[0], num_padding_indices), self.dummy_image_index, dtype=mindspore.int64
                            ),
                            tensor,
                        ],
                        dim=1,
                    )
                    batched_inputs[key].append(padded_indices)
        batched_keys = ["input_ids", "image_patches_indices"]
        if return_attention_mask:
            batched_keys.append("attention_mask")
        # Stack the per-sample tensors into batch tensors along dim 0.
        for key in batched_keys:
            batched_inputs[key] = ops.cat(batched_inputs[key], dim=0)
        return batched_inputs

    def get_sample_encoding(
        self,
        prompts,
        scale_factors,
        image_unpadded_heights,
        image_unpadded_widths,
        image_placeholder_id,
        image_newline_id,
        tensor_batch_images,
    ):
        """Build the model inputs (`input_ids`, `image_patches`, `image_patches_indices`)
        for a single batch item by interleaving image placeholder tokens with text tokens.

        NOTE(review): reads `self.subsequence_length`, which is only assigned inside
        `__call__`; calling this method directly before `__call__` would raise
        AttributeError — confirm whether that is intended.
        """
        image_present = ops.ones((1, 1, 1))
        # Expand image data into placeholder token ids and per-patch indices.
        model_image_input = self.image_processor.preprocess_with_tokenizer_info(
            image_input=tensor_batch_images,
            image_present=image_present,
            image_unpadded_h=image_unpadded_heights,
            image_unpadded_w=image_unpadded_widths,
            image_placeholder_id=image_placeholder_id,
            image_newline_id=image_newline_id,
            variable_sized=True,
        )
        # max_tokens_to_generate is embedded into this processor's call.
        prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
            tokenizer=self.tokenizer,
            prompts=prompts,
            scale_factors=scale_factors,
            max_tokens_to_generate=self.max_tokens_to_generate,
            max_position_embeddings=self.max_position_embeddings,
            add_BOS=True,
            add_beginning_of_answer_token=True,
        )
        # Merge image placeholder tokens and text tokens into a single unpacked stream.
        image_padded_unpacked_tokens = construct_full_unpacked_stream(
            num_real_text_tokens=prompts_length,
            input_stream=prompt_tokens,
            image_tokens=model_image_input["image_input_ids"],
            batch_size=1,
            num_sub_sequences=self.subsequence_length,
        )
        # Construct inputs for image patch indices.
        unpacked_image_patch_indices_per_batch = construct_full_unpacked_stream(
            num_real_text_tokens=prompts_length,
            input_stream=ops.full_like(prompt_tokens, -1),
            image_tokens=model_image_input["image_patch_indices_per_batch"],
            batch_size=1,
            num_sub_sequences=self.subsequence_length,
        )
        max_prompt_length = max(x.shape[-1] for x in image_padded_unpacked_tokens)
        # Cap the sequence length by the model's maximum position embeddings.
        max_seq_len_batch = min(max_prompt_length + self.max_tokens_to_generate, self.max_position_embeddings)
        tokens_to_place = min(max_seq_len_batch, max(0, image_padded_unpacked_tokens[0].shape[0]))

        # Use same packing logic for the image patch indices.
        image_patch_input_indices = full_unpacked_stream_to_tensor(
            all_bi_tokens_to_place=[tokens_to_place],
            full_unpacked_stream=unpacked_image_patch_indices_per_batch,
            fill_value=-1,
            batch_size=1,
            new_seq_len=max_seq_len_batch,
            offset=0,
        )
        image_patches_tensor = ops.stack([img[0] for img in model_image_input["image_patches"]])
        batch_encoding = {
            "input_ids": image_padded_unpacked_tokens[0].unsqueeze(0),
            "image_patches": image_patches_tensor,
            "image_patches_indices": image_patch_input_indices,
        }
        return batch_encoding

    def __call__(
        self,
        text=None,
        images=None,
        add_special_tokens: bool = True,
        return_attention_mask: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_token_type_ids: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> "FuyuBatchFeature":
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to
        encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        FuyuImageProcessor's [`~FuyuImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `List[str]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `List[PIL.Image.Image]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.

        Returns:
            [`FuyuBatchEncoding`]: A [`FuyuBatchEncoding`] with the following fields:

            - **input_ids** -- Tensor of token ids to be fed to a model. Returned when `text` is not `None`.
            - **image_patches** -- List of Tensor of image patches. Returned when `images` is not `None`.
            - **image_patches_indices** -- Tensor of indices where patch embeddings have to be inserted by the model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model when
              `return_attention_mask=True`.
        """
        # requires_backends(self, ["torch"])

        # --- Check input validity ---
        if not return_attention_mask:
            raise ValueError("`return_attention_mask=False` is not supported for this model.")
        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be None.")
        if text is not None and images is None:
            # Text-only path: delegate entirely to the tokenizer and return early.
            logger.warning("You are processing a text with no associated image. Make sure it is intended.")
            self.current_processor = self.tokenizer
            text_encoding = self.tokenizer(
                text=text,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_token_type_ids=return_token_type_ids,
                return_length=return_length,
                verbose=verbose,
                return_tensors=return_tensors,
                **kwargs,
            )
            return text_encoding

        if text is None and images is not None:
            # Image-only path: use an empty prompt per image.
            logger.warning("You are processing an image with no associated text. Make sure it is intended.")
            prompts = [[""]]
        if text is not None and images is not None:
            # NOTE(review): if `text` is neither str nor list here, `prompts` stays unbound
            # and the zip below raises NameError — confirm callers only pass str/list.
            if isinstance(text, str):
                prompts = [[text]]
            elif isinstance(text, list):
                prompts = [[text_seq] for text_seq in text]

        # --- Preprocess images using self.image_processor ---

        # We hard code "pt" here because the rest of the processing assumes torch tensors
        image_encoding = self.image_processor.preprocess(images, return_tensors="ms")
        batch_images = image_encoding["images"]
        image_unpadded_heights = image_encoding["image_unpadded_heights"]
        image_unpadded_widths = image_encoding["image_unpadded_widths"]
        scale_factors = image_encoding["image_scale_factors"]
        # State read later by `get_sample_encoding` — set here, per call.
        self.subsequence_length = 1  # Each batch contains only one sequence.
        self.batch_size = len(batch_images)

        # --- Use self.tokenizer to get the ids of special tokens to insert into image ids ---

        image_placeholder_id = self.tokenizer("|SPEAKER|", add_special_tokens=False)["input_ids"][1]
        image_newline_id = self.tokenizer("|NEWLINE|", add_special_tokens=False)["input_ids"][1]
        tensor_batch_images = ops.stack([img[0] for img in batch_images]).unsqueeze(1)

        # --- Use self.image_processor again to obtain the full token ids and batch inputs ---
        all_encodings = []

        # Encode each (prompt, image) pair independently, then left-pad into a batch.
        for prompt, scale_factor, image_unpadded_height, image_unpadded_width, tensor_batch_image in zip(
            prompts, scale_factors, image_unpadded_heights, image_unpadded_widths, tensor_batch_images
        ):
            sample_encoding = self.get_sample_encoding(
                prompts=[prompt],
                scale_factors=[scale_factor],
                # image_unpadded_heights=mindspore.tensor([image_unpadded_height]),
                # image_unpadded_widths=mindspore.tensor([image_unpadded_width]),
                image_unpadded_heights=mindspore.tensor(np.array([[image_unpadded_height[0].numpy()]])),
                image_unpadded_widths=mindspore.tensor(np.array([[image_unpadded_width[0].numpy()]])),
                image_placeholder_id=image_placeholder_id,
                image_newline_id=image_newline_id,
                tensor_batch_images=tensor_batch_image.unsqueeze(0),
            )
            all_encodings.append(sample_encoding)
        batch_encoding = self._left_pad_inputs_with_attention_mask(
            model_inputs=all_encodings, return_attention_mask=return_attention_mask
        )
        return FuyuBatchFeature(data=batch_encoding)

    def post_process_box_coordinates(self, outputs, target_sizes=None):
        """
        Transforms raw coordinates detected by [`FuyuForCausalLM`] to the original images' coordinate space.
        Coordinates will be returned in "box" format, with the following pattern:
        `<box>top, left, bottom, right</box>`

        Point coordinates are not supported yet.

        Args:
            outputs ([`GenerateOutput`]):
                Raw outputs from `generate`.
            target_sizes (`torch.Tensor`, *optional*):
                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
                the batch. If set, found coordinates in the output sequence are rescaled to the target sizes. If left
                to None, coordinates will not be rescaled.

        Returns:
            `GenerateOutput`: Same output type returned by `generate`, with output token ids replaced with
            boxed and possible rescaled coordinates.
        """

        def scale_factor_to_fit(original_size, target_size=None):
            # Shrink factor that fits `original_size` inside the target (or processor) size;
            # 1.0 when it already fits.
            height, width = original_size
            if target_size is None:
                max_height = self.image_processor.size["height"]
                max_width = self.image_processor.size["width"]
            else:
                max_height, max_width = target_size
            if width <= max_width and height <= max_height:
                return 1.0
            return min(max_height / height, max_width / width)

        def find_delimiters_pair(tokens, start_token, end_token):
            # First occurrence of (start_token, end_token) ids in `tokens`; (None, None) if absent.
            start_id = self.tokenizer.convert_tokens_to_ids(start_token)
            end_id = self.tokenizer.convert_tokens_to_ids(end_token)

            starting_positions = (tokens == start_id).nonzero().reshape(-1)
            ending_positions = (tokens == end_id).nonzero().reshape(-1)

            if ops.any(starting_positions) and ops.any(ending_positions):
                return (starting_positions[0], ending_positions[0])
            return (None, None)

        def tokens_to_boxes(tokens, original_size):
            # Rewrite every <box>...</box> token run into detokenizable "top, left, bottom, right" text.
            # NOTE(review): when a pair is malformed (end != start + 5) the `continue` re-finds
            # the same pair — this looks like a potential infinite loop; confirm against upstream.
            while (pair := find_delimiters_pair(tokens, TOKEN_BBOX_OPEN_STRING, TOKEN_BBOX_CLOSE_STRING)) != (
                None,
                None,
            ):
                start, end = pair
                if end != start + 5:
                    continue

                # Retrieve transformed coordinates from tokens
                coords = self.tokenizer.convert_ids_to_tokens(tokens[start + 1 : end])

                # Scale back to original image size and multiply by 2
                scale = scale_factor_to_fit(original_size)
                top, left, bottom, right = [2 * int(float(c) / scale) for c in coords]

                # Replace the IDs so they get detokenized right
                replacement = f" {TEXT_REPR_BBOX_OPEN}{top}, {left}, {bottom}, {right}{TEXT_REPR_BBOX_CLOSE}"
                replacement = self.tokenizer.tokenize(replacement)[1:]
                replacement = self.tokenizer.convert_tokens_to_ids(replacement)
                replacement = mindspore.tensor(replacement).to(tokens.dtype)

                tokens = ops.cat([tokens[:start], replacement, tokens[end + 1 :]], 0)
            return tokens

        def tokens_to_points(tokens, original_size):
            # Same rewrite as `tokens_to_boxes`, for <point>x, y</point> runs.
            # NOTE(review): shares the malformed-pair `continue` concern noted above.
            while (pair := find_delimiters_pair(tokens, TOKEN_POINT_OPEN_STRING, TOKEN_POINT_CLOSE_STRING)) != (
                None,
                None,
            ):
                start, end = pair
                if end != start + 3:
                    continue

                # Retrieve transformed coordinates from tokens
                coords = self.tokenizer.convert_ids_to_tokens(tokens[start + 1 : end])

                # Scale back to original image size and multiply by 2
                scale = scale_factor_to_fit(original_size)
                x, y = [2 * int(float(c) / scale) for c in coords]

                # Replace the IDs so they get detokenized right
                replacement = f" {TEXT_REPR_POINT_OPEN}{x}, {y}{TEXT_REPR_POINT_CLOSE}"
                replacement = self.tokenizer.tokenize(replacement)[1:]
                replacement = self.tokenizer.convert_tokens_to_ids(replacement)
                replacement = mindspore.tensor(replacement).to(tokens.dtype)

                tokens = ops.cat([tokens[:start], replacement, tokens[end + 1 :]], 0)
            return tokens

        if target_sizes is None:
            # Default to the processor's own size for every output sequence.
            target_sizes = ((self.image_processor.size["height"], self.image_processor.size["width"]),) * len(outputs)
        elif target_sizes.shape[1] != 2:
            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

        if len(outputs) != len(target_sizes):
            raise ValueError("Make sure that you pass in as many target sizes as output sequences")

        results = []
        for seq, size in zip(outputs, target_sizes):
            seq = tokens_to_boxes(seq, size)
            seq = tokens_to_points(seq, size)
            results.append(seq)

        return results

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

__all__ = ["FuyuProcessor"]

+ 2
- 8
mindnlp/transformers/models/hubert/__init__.py View File

@@ -15,14 +15,8 @@
''' Hubert Model '''

from . import configuration_hubert, modeling_hubert
from .configuration_hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig
from .modeling_hubert import (
HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
HubertForCTC,
HubertForSequenceClassification,
HubertModel,
HubertPreTrainedModel,
)
from .configuration_hubert import *
from .modeling_hubert import *

__all__ = []
__all__.extend(configuration_hubert.__all__)


+ 19
- 87
mindnlp/transformers/models/hubert/configuration_hubert.py View File

@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Hubert model configuration"""
"""Hubert model configuration"""

import functools
import operator
@@ -20,25 +20,16 @@ import operator
from ...configuration_utils import PretrainedConfig
from ....utils import logging

__all__ = [
'HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP',
'HubertConfig',
]

logger = logging.get_logger(__name__)

HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/hubert-base-ls960": "https://hf-mirror.com/facebook/hubert-base-ls960/resolve/main/config.json",
# See all Hubert models at https://hf-mirror.com/models?filter=hubert
}


class HubertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`HubertModel`]. It is used to instantiate an
Hubert model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Hubert
[facebook/hubert-base-ls960](https://hf-mirror.com/facebook/hubert-base-ls960) architecture.
[facebook/hubert-base-ls960](https://huggingface.co/facebook/hubert-base-ls960) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
@@ -67,7 +58,7 @@ class HubertConfig(PretrainedConfig):
attention_dropout(`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probabilitiy for the final projection layer of [`Wav2Vec2ForCTC`].
The dropout probability for the final projection layer of [`Wav2Vec2ForCTC`].
layerdrop (`float`, *optional*, defaults to 0.1):
The LayerDrop probability. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more
details.
@@ -150,19 +141,20 @@ class HubertConfig(PretrainedConfig):
Dimensionality of the projection before token mean-pooling for classification.

Example:
```python
>>> from transformers import HubertModel, HubertConfig
...
>>> # Initializing a Hubert facebook/hubert-base-ls960 style configuration
>>> configuration = HubertConfig()
...
>>> # Initializing a model from the facebook/hubert-base-ls960 style configuration
>>> model = HubertModel(configuration)
...
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""

```python
>>> from transformers import HubertModel, HubertConfig

>>> # Initializing a Hubert facebook/hubert-base-ls960 style configuration
>>> configuration = HubertConfig()

>>> # Initializing a model from the facebook/hubert-base-ls960 style configuration
>>> model = HubertModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""

model_type = "hubert"

def __init__(
@@ -207,57 +199,7 @@ class HubertConfig(PretrainedConfig):
eos_token_id=2,
**kwargs,
):
"""
Initializes a new instance of the HubertConfig class.
Args:
vocab_size (int): The size of the vocabulary.
hidden_size (int): The size of the hidden layers.
num_hidden_layers (int): The number of hidden layers.
num_attention_heads (int): The number of attention heads.
intermediate_size (int): The size of the intermediate layers.
hidden_act (str): The activation function for the hidden layers.
hidden_dropout (float): The dropout rate for the hidden layers.
activation_dropout (float): The dropout rate for activations.
attention_dropout (float): The dropout rate for attention mechanisms.
feat_proj_layer_norm (bool): Whether to apply layer normalization to projection features.
feat_proj_dropout (float): The dropout rate for feature projection.
final_dropout (float): The final dropout rate.
layerdrop (float): The layer drop probability.
initializer_range (float): The range for parameter initialization.
layer_norm_eps (float): The epsilon value for layer normalization.
feat_extract_norm (str): The normalization type for feature extraction.
feat_extract_activation (str): The activation function for feature extraction.
conv_dim (tuple): The dimensions for convolutional layers.
conv_stride (tuple): The stride values for convolutional layers.
conv_kernel (tuple): The kernel sizes for convolutional layers.
conv_bias (bool): Whether to use bias in convolutional layers.
num_conv_pos_embeddings (int): The number of positional embeddings for convolutional layers.
num_conv_pos_embedding_groups (int): The number of groups for positional embeddings.
do_stable_layer_norm (bool): Whether to use stable layer normalization.
apply_spec_augment (bool): Whether to apply SpecAugment during training.
mask_time_prob (float): The probability of masking in the time dimension.
mask_time_length (int): The maximum length of time masking.
mask_time_min_masks (int): The minimum number of time masks.
mask_feature_prob (float): The probability of masking in the feature dimension.
mask_feature_length (int): The maximum length of feature masking.
mask_feature_min_masks (int): The minimum number of feature masks.
ctc_loss_reduction (str): The reduction type for CTC loss.
ctc_zero_infinity (bool): Whether to set positive infinity to zero in CTC loss.
use_weighted_layer_sum (bool): Whether to use weighted layer sum for classification.
classifier_proj_size (int): The size of the classifier projection layer.
pad_token_id (int): The token ID for padding.
bos_token_id (int): The token ID for the beginning of sequence.
eos_token_id (int): The token ID for the end of sequence.
Returns:
None
Raises:
ValueError: If the configuration for convolutional layers is incorrect.
"""
super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)

self.hidden_size = hidden_size
self.feat_extract_norm = feat_extract_norm
self.feat_extract_activation = feat_extract_activation
@@ -313,16 +255,6 @@ class HubertConfig(PretrainedConfig):

@property
def inputs_to_logits_ratio(self):
"""
Calculates the ratio of inputs to logits based on the convolutional strides in the Hubert configuration.
Args:
self (HubertConfig): The instance of HubertConfig.
Returns:
int: The ratio of inputs to logits calculated as the product of convolutional strides.
Raises:
None.
"""
return functools.reduce(operator.mul, self.conv_stride, 1)

__all__ = ['HubertConfig']

+ 308
- 1111
mindnlp/transformers/models/hubert/modeling_hubert.py
File diff suppressed because it is too large
View File


+ 29
- 0
mindnlp/transformers/models/idefics/__init__.py View File

@@ -0,0 +1,29 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Idefics model."""
from . import configuration_idefics, image_processing_idefics, modeling_idefics, perceiver, processing_idefics, vision
from .configuration_idefics import *
from .image_processing_idefics import *
from .modeling_idefics import *
from .perceiver import *
from .processing_idefics import *
from .vision import *

__all__ = []
__all__.extend(configuration_idefics.__all__)
__all__.extend(image_processing_idefics.__all__)
__all__.extend(modeling_idefics.__all__)
__all__.extend(perceiver.__all__)
__all__.extend(processing_idefics.__all__)
__all__.extend(vision.__all__)

+ 317
- 0
mindnlp/transformers/models/idefics/configuration_idefics.py View File

@@ -0,0 +1,317 @@
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Idefics model configuration"""

from mindnlp.utils import logging
from ...configuration_utils import PretrainedConfig

logger = logging.get_logger(__name__)


class IdeficsVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of the vision encoder used by [`IdeficsModel`]. It is
    used to instantiate an Idefics model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        embed_dim (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`;
            mapped via `attribute_map` below)
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        intermediate_size (`int`, *optional*, defaults to 5120):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of image channels.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization
            testing).
    """

    model_type = "idefics"
    # Lets `config.hidden_size` transparently read/write `config.embed_dim`.
    attribute_map = {
        "hidden_size": "embed_dim",
    }

    def __init__(
        self,
        embed_dim=768,
        image_size=224,
        intermediate_size=5120,
        patch_size=14,
        num_hidden_layers=32,
        num_attention_heads=16,
        num_channels=3,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        self.embed_dim = embed_dim
        self.image_size = image_size
        self.intermediate_size = intermediate_size
        self.patch_size = patch_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.hidden_act = hidden_act

        # Remaining kwargs (e.g. serialization metadata) are handled by the base config.
        super().__init__(**kwargs)


class IdeficsPerceiverConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of the Perceiver Resampler used by [`IdeficsModel`].
    It is used to instantiate an Idefics model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        use_resampler (`bool`, *optional*, defaults to `False`):
            Whether or not to use the resampler
        resampler_n_latents (`int`, *optional*, defaults to 64):
            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
        resampler_depth (`int`, *optional*, defaults to 6):
            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
        resampler_n_heads (`int`, *optional*, defaults to 16):
            Number of heads in each Transformer block (for multi-headed self-attention).
        resampler_head_dim (`int`, *optional*, defaults to 96):
            Dimensionality of each head projection in the Transformer block.
        qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
            Whether or not to use qk layer norms in perceiver
    """

    model_type = "idefics"

    def __init__(
        self,
        use_resampler=False,
        resampler_n_latents=64,
        resampler_depth=6,
        resampler_n_heads=16,
        resampler_head_dim=96,
        qk_layer_norms_perceiver=False,
        **kwargs,
    ):
        self.use_resampler = use_resampler
        self.resampler_n_latents = resampler_n_latents
        self.resampler_depth = resampler_depth
        self.resampler_n_heads = resampler_n_heads
        self.resampler_head_dim = resampler_head_dim
        self.qk_layer_norms_perceiver = qk_layer_norms_perceiver

        # Remaining kwargs (e.g. serialization metadata) are handled by the base config.
        super().__init__(**kwargs)


class IdeficsConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        additional_vocab_size (`int`, *optional*, defaults to 0):
            Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
            are always trainable whereas regular vocab tokens can be frozen or not.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Idefics model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~IdeficsModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        alpha_initializer (`str`, *optional*, defaults to `"zeros"`):
            Initialization type for the alphas.
        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross
            Attention.
        alpha_type (`str`, *optional*, defaults to `"float"`):
            Whether the gating alphas should be vectors or single floats.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0)
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1)
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2)
            End of stream token id.
        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        cross_layer_interval (`int`, *optional*, default to 1)
            Interval for cross attention (from text to image) layers.
        qk_layer_norms (`bool`, *optional*, defaults to `False`): Whether to add layer norm after q and k
        freeze_text_layers (`bool`, *optional*, defaults to `True`): Whether to freeze text layers
        freeze_text_module_exceptions (`list`, *optional*, defaults to `[]`):
            Exceptions to freezing text layers when `freeze_text_layers` is `True`
        freeze_lm_head (`bool`, *optional*, defaults to `False`): Whether to freeze lm head
        freeze_vision_layers (`bool`, *optional*, defaults to `True`): Whether to freeze vision layers
        freeze_vision_module_exceptions (`list`, *optional*, defaults to `[]`):
            Exceptions to freezing vision layers when `freeze_vision_layers` is `True`
        use_resampler (`bool`, *optional*, defaults to `False`): Whether to use the Resampler
        vision_config (`IdeficsVisionConfig`, *optional*): Custom vision config or dict
        perceiver_config (`IdeficsPerceiverConfig`, *optional*): Custom perceiver config or dict

    """

    model_type = "idefics"
    is_composition = False

    def __init__(
        self,
        vocab_size=32000,
        additional_vocab_size=0,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        dropout=0.0,
        hidden_act="silu",
        initializer_range=0.02,
        alpha_initializer="zeros",
        alphas_initializer_range=0.0,
        alpha_type="float",
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        cross_layer_interval=1,
        qk_layer_norms=False,
        freeze_text_layers=True,
        freeze_text_module_exceptions=None,
        freeze_lm_head=False,
        freeze_vision_layers=True,
        freeze_vision_module_exceptions=None,
        use_resampler=False,
        vision_config=None,
        perceiver_config=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.additional_vocab_size = additional_vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.dropout = dropout
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.alpha_initializer = alpha_initializer
        self.alphas_initializer_range = alphas_initializer_range
        self.alpha_type = alpha_type
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache

        self.cross_layer_interval = cross_layer_interval
        self.qk_layer_norms = qk_layer_norms
        self.freeze_vision_layers = freeze_vision_layers

        self.freeze_text_layers = freeze_text_layers
        # FIX: the exception lists previously used mutable default arguments
        # (`=[]`), which are shared across all instances; normalize `None` here.
        self.freeze_text_module_exceptions = (
            freeze_text_module_exceptions if freeze_text_module_exceptions is not None else []
        )
        self.freeze_vision_module_exceptions = (
            freeze_vision_module_exceptions if freeze_vision_module_exceptions is not None else []
        )
        self.freeze_lm_head = freeze_lm_head

        self.use_resampler = use_resampler

        if perceiver_config is None:
            self.perceiver_config = IdeficsPerceiverConfig()
        elif isinstance(perceiver_config, dict):
            self.perceiver_config = IdeficsPerceiverConfig(**perceiver_config)
        elif isinstance(perceiver_config, IdeficsPerceiverConfig):
            self.perceiver_config = perceiver_config
        else:
            # Previously an unexpected type silently left `perceiver_config`
            # unset, causing a late AttributeError; fail fast instead.
            raise ValueError(
                f"perceiver_config must be None, a dict or an IdeficsPerceiverConfig, got {type(perceiver_config)}"
            )

        if vision_config is None:
            self.vision_config = IdeficsVisionConfig()
        elif isinstance(vision_config, dict):
            self.vision_config = IdeficsVisionConfig(**vision_config)
        elif isinstance(vision_config, IdeficsVisionConfig):
            self.vision_config = vision_config
        else:
            raise ValueError(
                f"vision_config must be None, a dict or an IdeficsVisionConfig, got {type(vision_config)}"
            )

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        # IMPORTANT: Do not do any __init__ args-based checks in the constructor, since
        # PretrainedConfig.from_dict first instantiates the class with the config dict and only then
        # updates the config object with `kwargs` from from_pretrained, so during the instantiation
        # of this object many attributes have default values and haven't yet been overridden.
        # Do any required checks inside `from_pretrained` once the superclass' `from_pretrained` was run.


# Public symbols exported by this configuration module.
__all__ = [
    "IdeficsConfig",
    "IdeficsPerceiverConfig",
    "IdeficsVisionConfig"
]

+ 173
- 0
mindnlp/transformers/models/idefics/image_processing_idefics.py View File

@@ -0,0 +1,173 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Idefics."""

from typing import Callable, Dict, List, Optional, Union

from PIL import Image

from mindnlp.utils import TensorType, is_mindspore_available

from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import resize, to_channel_dimension_format
from ...image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
make_list_of_images,
to_numpy_array,
valid_images,
)

# Per-channel normalization statistics used by IDEFICS preprocessing
# (presumably the CLIP training statistics — TODO confirm against upstream).
IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]


def convert_to_rgb(image):
    """Return *image* in RGB mode, compositing any transparency onto white.

    A plain ``image.convert("RGB")`` would only be safe for .jpg inputs, as it
    creates a wrong background for transparent images; the call to
    ``Image.alpha_composite`` handles that case.
    """
    if image.mode != "RGB":
        rgba = image.convert("RGBA")
        white_canvas = Image.new("RGBA", rgba.size, (255, 255, 255))
        image = Image.alpha_composite(white_canvas, rgba).convert("RGB")
    return image


class IdeficsImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Idefics image processor.

    Args:
        image_size (`int`, *optional*, defaults to 224):
            Resize to image size
        image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
            method.
        image_num_channels (`int`, *optional*, defaults to 3):
            Number of image channels.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        image_size: int = 224,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        image_num_channels: Optional[int] = 3,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        self.image_size = image_size
        self.image_num_channels = image_num_channels
        # `None` means "use the IDEFICS standard statistics"; resolved in `preprocess`.
        self.image_mean = image_mean
        self.image_std = image_std

    def preprocess(
        self,
        images: ImageInput,
        image_num_channels: Optional[int] = 3,
        image_size: Optional[Dict[str, int]] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        transform: Callable = None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.MINDSPORE,
        **kwargs,
    ) -> TensorType:
        """
        Preprocess a batch of images.

        Args:
            images (`ImageInput`):
                A list of images to preprocess.
            image_size (`int`, *optional*, defaults to `self.image_size`):
                Resize to image size
            image_num_channels (`int`, *optional*, defaults to `self.image_num_channels`):
                Number of image channels.
            image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
                Mean to use if normalizing the image. This is a float or list of floats the length of the number of
                channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
                Standard deviation to use if normalizing the image. This is a float or list of floats the length of
                the number of channels in the image.
            transform (`Callable`, *optional*, defaults to `None`):
                A custom transform function that accepts a single image can be passed for training. For example,
                `torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
                assumed - and then a preset of inference-specific transforms will be applied to the images

        Returns:
            a MindSpore tensor of the processed images

        """
        image_size = image_size if image_size is not None else self.image_size
        image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        # FIX: fall back to the documented IDEFICS defaults when neither the call
        # nor the constructor provided normalization statistics (previously a
        # bare `None` was forwarded to `self.normalize`).
        if image_mean is None:
            image_mean = IDEFICS_STANDARD_MEAN
        if image_std is None:
            image_std = IDEFICS_STANDARD_STD
        size = (image_size, image_size)

        if isinstance(images, list) and len(images) == 0:
            return []

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        # For training a user needs to pass their own set of transforms as a Callable.
        # For reference this is what was used in the original IDEFICS training:
        # transform = transforms.Compose([
        #     convert_to_rgb,
        #     transforms.RandomResizedCrop((size, size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
        #     transforms.ToTensor(),
        #     transforms.Normalize(mean=image_mean, std=image_std),
        # ])
        if transform is not None:
            if not is_mindspore_available():
                # FIX: the original message referenced torch, but the actual
                # requirement checked here is MindSpore.
                raise ImportError("To pass in `transform` MindSpore must be installed")
            import mindnlp

            images = [transform(x) for x in images]
            return mindnlp.core.ops.stack(images)

        # for inference we do the exact transforms that were used to train IDEFICS
        images = [convert_to_rgb(x) for x in images]
        # further transforms expect numpy arrays
        images = [to_numpy_array(x) for x in images]
        images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
        images = [self.rescale(image=image, scale=1 / 255) for image in images]
        images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
        images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
        images = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)["pixel_values"]

        return images


# Public symbols exported by this image-processing module.
__all__ = [
    "IdeficsImageProcessor"
]

+ 1555
- 0
mindnlp/transformers/models/idefics/modeling_idefics.py View File

@@ -0,0 +1,1555 @@
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Idefics model."""

from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

import mindspore
import mindnlp.core.nn.functional as F
from mindnlp.core import get_default_dtype
from mindnlp.core import nn, ops
from mindnlp.utils import logging

from ...modeling_utils import PreTrainedModel
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import ModelOutput
from ...modeling_utils import PretrainedConfig
from ...ms_utils import ALL_LAYERNORM_LAYERS

from .configuration_idefics import IdeficsConfig
from .perceiver import IdeficsPerceiverResampler
from .vision import IdeficsVisionTransformer

logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "IdeficsConfig"


@dataclass
class IdeficsBaseModelOutputWithPast(ModelOutput):
    """
    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`tuple(tuple(mindspore.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(mindspore.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `mindspore.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(mindspore.Tensor)`, *optional*):
            Tuple of `mindspore.Tensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.

            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    # All fields default to None so the output can be constructed incrementally
    # and optional components can simply be omitted.
    last_hidden_state: mindspore.Tensor = None
    past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None
    hidden_states: Optional[Tuple[mindspore.Tensor]] = None
    attentions: Optional[Tuple[mindspore.Tensor]] = None
    image_hidden_states: Optional[Tuple[mindspore.Tensor]] = None


@dataclass
class IdeficsCausalLMOutputWithPast(ModelOutput):
    """
    Base class for Idefics causal language model (or autoregressive) outputs.

    Args:
        loss (`mindspore.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(mindspore.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(mindspore.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `mindspore.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(mindspore.Tensor)`, *optional*):
            Tuple of `mindspore.Tensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.

            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    # All fields default to None so the output can be constructed incrementally
    # and optional components can simply be omitted.
    loss: Optional[mindspore.Tensor] = None
    logits: mindspore.Tensor = None
    past_key_values: Optional[List[mindspore.Tensor]] = None
    hidden_states: Optional[Tuple[mindspore.Tensor]] = None
    attentions: Optional[Tuple[mindspore.Tensor]] = None
    image_hidden_states: Optional[Tuple[mindspore.Tensor]] = None


def expand_inputs_for_generation(
    input_ids,
    expand_size=1,
    is_encoder_decoder=False,
    attention_mask=None,
    encoder_outputs=None,
    **model_kwargs,
):
    """Duplicate each batch row `expand_size` times (e.g. for beam search).

    Returns the expanded `input_ids` together with `model_kwargs` whose
    batch-indexed entries have been expanded consistently.
    """
    # Index pattern [0,0,...,1,1,...]: selects every original row `expand_size` times.
    # (equivalent to ops.arange(n).view(-1, 1).repeat((1, expand_size)).view(-1))
    repeat_idx = ops.tile(ops.arange(input_ids.shape[0]).view(-1, 1), (1, expand_size)).view(-1)
    input_ids = input_ids.index_select(0, repeat_idx)

    # Ensure the image-related kwargs are always present, defaulting to None.
    for key in ("pixel_values", "image_encoder_embeddings", "perceiver_embeddings", "image_attention_mask"):
        model_kwargs[key] = model_kwargs.get(key, None)

    if "token_type_ids" in model_kwargs:
        model_kwargs["token_type_ids"] = model_kwargs["token_type_ids"].index_select(0, repeat_idx)

    if attention_mask is not None:
        model_kwargs["attention_mask"] = attention_mask.index_select(0, repeat_idx)

    if model_kwargs["image_attention_mask"] is not None:
        model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select(0, repeat_idx)

    # Only one image representation is expanded: raw pixels, or precomputed
    # encoder/perceiver embeddings, in that order of precedence.
    if model_kwargs["pixel_values"] is not None:
        model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, repeat_idx)
    elif model_kwargs["image_encoder_embeddings"] is not None:
        model_kwargs["image_encoder_embeddings"] = model_kwargs["image_encoder_embeddings"].index_select(0, repeat_idx)
    elif model_kwargs["perceiver_embeddings"] is not None:
        model_kwargs["perceiver_embeddings"] = model_kwargs["perceiver_embeddings"].index_select(0, repeat_idx)

    return input_ids, model_kwargs


def prepare_inputs_for_generation(input_ids, past_key_values=None, **kwargs):
    """Assemble the model inputs for one generation step.

    When `past_key_values` is provided, only the last token (and its
    token type id / position id) is kept, since earlier positions are
    already covered by the cache.

    Args:
        input_ids: Token ids of shape `(batch, seq_len)`.
        past_key_values: Cached key/value states from previous steps, or `None`.
        kwargs: Optional `attention_mask`, `position_ids`, `token_type_ids`,
            image inputs (`pixel_values`, `image_encoder_embeddings`,
            `perceiver_embeddings`, `image_attention_mask`), `use_cache`,
            `interpolate_pos_encoding`.

    Returns:
        A dict of keyword arguments ready to pass to the model's forward.
    """
    token_type_ids = kwargs.get("token_type_ids", None)
    # only last token for inputs_ids if past is defined in kwargs
    if past_key_values:
        input_ids = input_ids[:, -1].unsqueeze(-1)
        if token_type_ids is not None:
            token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

    attention_mask = kwargs.get("attention_mask", None)
    position_ids = kwargs.get("position_ids", None)

    if attention_mask is not None and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        # FIX: `masked_fill` is NOT in-place — the original discarded its
        # result, leaving padded positions at -1. Assign the returned tensor.
        position_ids = position_ids.masked_fill(attention_mask == 0, 1)
        if past_key_values:
            position_ids = position_ids[:, -1].unsqueeze(-1)

    pixel_values = kwargs.get("pixel_values", None)
    image_encoder_embeddings = kwargs.get("image_encoder_embeddings", None)
    perceiver_embeddings = kwargs.get("perceiver_embeddings", None)
    image_attention_mask = kwargs.get("image_attention_mask", None)
    interpolate_pos_encoding = kwargs.get("interpolate_pos_encoding", False)

    return {
        "input_ids": input_ids,
        "past_key_values": past_key_values,
        "use_cache": kwargs.get("use_cache"),
        "position_ids": position_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
        "pixel_values": pixel_values,
        "image_encoder_embeddings": image_encoder_embeddings,
        "perceiver_embeddings": perceiver_embeddings,
        "image_attention_mask": image_attention_mask,
        "interpolate_pos_encoding": interpolate_pos_encoding,
    }


def freeze_model(model, module_exceptions=None):
    """Freeze all parameters of `model`, except modules of the listed types.

    Args:
        model: Model whose parameters are frozen in place.
        module_exceptions (`list[str]`, *optional*):
            Class names out of {"LayerNorm", "Linear", "Embedding"} whose
            modules should remain trainable.

    Returns:
        The same `model`, mutated in place.
    """
    # FIX: avoid a mutable default argument (`=[]`); normalize None instead.
    module_exceptions = module_exceptions if module_exceptions is not None else []
    mapping = {
        "LayerNorm": nn.LayerNorm,
        "Linear": nn.Linear,
        "Embedding": nn.Embedding,
    }
    module_exceptions_mapped = [mapping[m] for m in module_exceptions]

    for module in model.modules():
        if module_exceptions and any(isinstance(module, t) for t in module_exceptions_mapped):
            module.requires_grad_(True)  # Explicitly setting it to true to avoid any mistakes
        else:
            # Freeze parameter-by-parameter via the MindSpore-style
            # `get_parameters()` API rather than `requires_grad_(False)`.
            for p in module.get_parameters():
                p.requires_grad = False
    return model


class IdeficsDecoupledEmbedding(nn.Embedding):
# Derived from https://pyops.org/docs/stable/_modules/torch/nn/modules/sparse.html#Embedding
"""
Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practise, the
regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
then it will create `num_additional_embeddings` additional parameters that are always trained. If
`num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
"""

def __init__(
self,
num_embeddings,
num_additional_embeddings,
embedding_dim,
partially_freeze: Optional[bool] = False,
dtype=None,
padding_idx=None,
**kwargs,
) -> None:
"""
Args:
num_embeddings (`int`):
Size of the dictionary of embeddings
num_additional_embeddings (`int`):
Number of additional embeddings. Only useful when you `partially_freeze=True`.
embedding_dim (`int`):
The size of each embedding vector
partially_freeze: (`bool`, *optional*, defaults to `False`):
If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
padding_idx (`int`, *optional*):
The padding index (needs to be less than num_embeddings)

Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
`max_norm` or `norm_type`. We are not supporting these.
"""
if padding_idx is not None and padding_idx > num_embeddings:
raise ValueError(f"padding_idx must be within num_embeddings. Got {padding_idx} and {num_embeddings}")
super().__init__(
num_embeddings=num_embeddings,
embedding_dim=embedding_dim,
dtype=dtype,
padding_idx=padding_idx,
**kwargs,
)
self.num_embeddings = num_embeddings
self.padding_idx = padding_idx
self.num_additional_embeddings = num_additional_embeddings
self.partially_freeze = partially_freeze

if partially_freeze:
self.weight.requires_grad = False

if self.num_additional_embeddings > 0:
self.additional_embedding = nn.Embedding(
num_embeddings=self.num_additional_embeddings,
embedding_dim=embedding_dim,
dtype=dtype,
)

    def forward(self, input_ids):
        """
        we have 2 embeddings, with different indices - one pretrained self.weight and another
        self.additional_embedding.weight that is being trained.

        in order to make a lookup of the input ids, we:
        1. find out the indices of the entries belonging to the 2nd embedding
        2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
           embedding starts from 0 and not num_embeddings
        3. perform the 2nd embedding lookup
        4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
        5. perform the 1st embedding lookup
        6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup

        note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
        then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
        i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
        usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
        measure.

        """
        # Fast path: no extra vocabulary, behave exactly like nn.Embedding.
        if self.num_additional_embeddings == 0:
            return F.embedding(input_ids, self.weight)

        # copy so that we don't modify the original input_ids later on
        input_ids = input_ids.copy()
        # Step 1: locate ids >= num_embeddings, i.e. ids of the additional table.
        additional_vocab_indices = ops.where(input_ids >= self.num_embeddings) # pylint: disable=no-value-for-parameter
        input_ids_additional_vocab = input_ids[additional_vocab_indices]
        # Steps 2-3: shift into the additional table's 0-based range and look up.
        additional_embeddings = self.additional_embedding(input_ids_additional_vocab - self.num_embeddings)

        # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
        input_ids[additional_vocab_indices] = 0
        # Step 5: base-table lookup (rows at the masked positions are placeholders).
        full_vector = F.embedding(input_ids, self.weight)

        # overwrite the records with high indices
        full_vector[additional_vocab_indices] = additional_embeddings

        return full_vector

def extra_repr(self) -> str:
return "num_embeddings={}, num_additional_embeddings={}, embedding_dim={}, partially_freeze={}".format(
self.num_embeddings,
self.num_additional_embeddings,
self.embedding_dim,
self.partially_freeze,
)


class IdeficsDecoupledLinear(nn.Linear):
    # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/linear.html#Linear
    """
    Linear layer whose parameters are decoupled so that the regular `weight`/`bias` can be frozen
    (`partially_freeze=True`) while, if `out_additional_features` > 0, an always-trainable extra
    projection of `out_additional_features * in_features` parameters is learned on the side. With
    `out_additional_features=0` the module behaves exactly like a plain `nn.Linear`.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        out_additional_features: int = 0,
        bias: bool = True,
        partially_freeze: bool = True,
        dtype=None,
    ) -> None:
        """
        out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
        `partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
        parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
        """
        super().__init__(in_features, out_features, bias, dtype)
        self.out_additional_features = out_additional_features
        self.partially_freeze = partially_freeze

        self.in_features = in_features
        self.out_features = out_features

        # Freeze the base projection; the side projection (below) stays trainable.
        if partially_freeze:
            self.weight.requires_grad_(False)
            if bias:
                self.bias.requires_grad_(False)

        if out_additional_features > 0:
            self.additional_fc = nn.Linear(
                in_features=in_features,
                out_features=out_additional_features,
                bias=bias,
                dtype=dtype,
            )

    def forward(self, input: mindspore.Tensor) -> mindspore.Tensor:
        base_out = F.linear(input, self.weight, self.bias)
        if self.out_additional_features <= 0:
            return base_out
        # Append the always-trainable extra features along the last dimension.
        extra_out = self.additional_fc(input)
        return ops.cat((base_out, extra_out), -1)

    def extra_repr(self) -> str:
        """Overwriting `nn.Linear.extra_repr` to include new parameters."""
        return (
            f"in_features={self.in_features}, "
            f"out_features={self.out_features}, "
            f"out_additional_features={self.out_additional_features}, "
            f"bias={self.bias is not None}, "
            f"partially_freeze={self.partially_freeze}"
        )


# this was adapted from LlamaRMSNorm
class IdeficsRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        IdeficsRMSNorm is equivalent to T5LayerNorm: rescale by the root mean
        square of the last dimension, no mean subtraction and no bias.
        """
        super().__init__()
        # Learnable per-channel gain; `variance_epsilon` guards the rsqrt.
        self.weight = nn.Parameter(ops.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Accumulate the mean square in float32 for numerical stability.
        mean_square = hidden_states.to(mindspore.float32).pow(2).mean(-1, keep_dims=True)
        hidden_states = hidden_states * ops.rsqrt(mean_square + self.variance_epsilon)

        # convert into half-precision if necessary
        if self.weight.dtype in [mindspore.float16, mindspore.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


# Register the RMSNorm variant so generic utilities that special-case layer-norm
# classes (e.g. weight-decay exclusion) treat it like the others.
ALL_LAYERNORM_LAYERS.append(IdeficsRMSNorm)


# this was adapted from LlamaRotaryEmbedding
class IdeficsEmbedding(nn.Module):
    """Rotary position embedding: precomputes cos/sin tables up to a cached length
    and grows the cache lazily when a longer sequence is seen."""

    def __init__(self, dim, max_position_embeddings=2048, base=10000):
        """
        Args:
            dim: rotary dimension the tables are built for (per-head size).
            max_position_embeddings: initial length of the cos/sin cache.
            base: base of the inverse-frequency spectrum.
        """
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        # inv_freq[j] = 1 / base^(2j/dim): one frequency per pair of channels.
        inv_freq = 1.0 / (self.base ** (ops.arange(0, self.dim, 2, dtype=mindspore.int64).float() / self.dim))
        self.inv_freq = inv_freq

        # Build here to make `ops.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, dtype=get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, dtype):
        # (Re)build the cos/sin lookup tables for positions [0, seq_len).
        self.max_seq_len_cached = seq_len
        t = ops.arange(self.max_seq_len_cached, dtype=mindspore.int64).type_as(self.inv_freq)

        # Outer product: freqs[i, j] = position_i * inv_freq_j.
        freqs = ops.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = ops.cat((freqs, freqs), dim=-1)
        self.cos_cached = emb.cos().to(dtype)
        self.sin_cached = emb.sin().to(dtype)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        # NOTE(review): the `seq_len=None` default would fail the `>` comparison
        # below — callers are expected to always pass an explicit length.
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )


def rotate_half(x):
    """Rotates half the hidden dims of the input: (a, b) -> (-b, a) across the split."""
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return ops.cat((-second, first), dim=-1)


# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`mindspore.Tensor`): The query tensor.
        k (`mindspore.Tensor`): The key tensor.
        cos (`mindspore.Tensor`): The cosine part of the rotary embedding.
        sin (`mindspore.Tensor`): The sine part of the rotary embedding.
        position_ids (`mindspore.Tensor`):
            Token positions used to index the cos/sin tables; lets callers pass
            offset positions when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Axis along which cos[position_ids] / sin[position_ids] get a broadcast
            dimension so they line up with q and k. Use 1 for layouts of shape
            [batch, heads, seq, head_dim], 2 for [batch, seq, heads, head_dim].
    Returns:
        `tuple(mindspore.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """

    # Gather per-position rows, then add a broadcast axis over the heads dim.
    cos_b = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin_b = sin[position_ids].unsqueeze(unsqueeze_dim)
    rotated_q = (q * cos_b) + (rotate_half(q) * sin_b)
    rotated_k = (k * cos_b) + (rotate_half(k) * sin_b)
    return rotated_q, rotated_k


# this was adapted from LlamaMLP
class IdeficsMLP(nn.Module):
    """Gated feed-forward block: down_proj(act_fn(gate_proj(x)) * up_proj(x))."""

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
    ):
        super().__init__()
        # Gate and up projections expand to the intermediate width; down projects
        # back to the hidden size. All three are bias-free.
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.act_fn = ACT2FN[hidden_act]

    def forward(self, x):
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)


# this was adapted from LlamaAttention
class IdeficsAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        dropout: float = 0.0,
        is_cross_attention: bool = False,
        config: PretrainedConfig = None,
        qk_layer_norms: bool = False,
    ):
        """
        Args:
            hidden_size: width of the residual stream; must be divisible by `num_heads`.
            num_heads: number of attention heads.
            dropout: attention dropout probability (applied inside SDPA in training mode).
            is_cross_attention: when `True`, keys/values are projected from
                `key_value_states` (e.g. image features) instead of `hidden_states`.
            config: model config; only `vision_config.embed_dim` and `rms_norm_eps`
                are read here.
            qk_layer_norms: when `True`, RMS-normalize queries and keys per head.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.dropout = dropout
        self.is_causal = True

        if (self.head_dim * num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {num_heads})."
            )

        self.is_cross_attention = is_cross_attention

        # NOTE(review): guard and message are inherited from the PyTorch original;
        # in this port it only checks that `nn.functional` exposes
        # `scaled_dot_product_attention`.
        if not hasattr(nn.functional, "scaled_dot_product_attention"):
            raise ValueError("this model requires pytorch 2.0 or higher")

        if self.is_cross_attention:
            # Keys/values come from the vision tower, whose embedding width may
            # differ from the text hidden size.
            kv_input_dim = (
                self.hidden_size if not hasattr(config.vision_config, "embed_dim") else config.vision_config.embed_dim
            )
            self.q_proj = nn.Linear(
                self.hidden_size,
                num_heads * self.head_dim,
                bias=False,
            )
            self.k_proj = nn.Linear(kv_input_dim, num_heads * self.head_dim, bias=False)
            self.v_proj = nn.Linear(
                kv_input_dim,
                num_heads * self.head_dim,
                bias=False,
            )
        else:
            self.q_proj = nn.Linear(
                self.hidden_size,
                num_heads * self.head_dim,
                bias=False,
            )
            self.k_proj = nn.Linear(
                self.hidden_size,
                num_heads * self.head_dim,
                bias=False,
            )
            self.v_proj = nn.Linear(
                self.hidden_size,
                num_heads * self.head_dim,
                bias=False,
            )
        self.o_proj = nn.Linear(
            num_heads * self.head_dim,
            hidden_size,
            bias=False,
        )
        self.rotary_emb = IdeficsEmbedding(self.head_dim)

        self.qk_layer_norms = qk_layer_norms
        if self.qk_layer_norms:
            self.q_layer_norm = IdeficsRMSNorm(self.head_dim, eps=config.rms_norm_eps)
            self.k_layer_norm = IdeficsRMSNorm(self.head_dim, eps=config.rms_norm_eps)

    def _shape(self, tensor: mindspore.Tensor, seq_len: int, bsz: int):
        # (bsz, seq, hidden) -> (bsz, num_heads, seq, head_dim).
        # NOTE(review): not referenced by `forward` below; kept for API parity.
        return ops.transpose(tensor.view(bsz, seq_len, self.num_heads, self.head_dim), 1, 2)

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        key_value_states: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_value: Optional[Tuple[mindspore.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
        """Run self- or cross-attention via SDPA.

        Returns:
            `(attn_output, attn_weights, past_key_value)`; `attn_weights` is always
            `None` here because SDPA does not expose the probabilities.
        """
        # if key_value_states are provided this layer is used as a cross-attention layer
        is_cross_attention = self.is_cross_attention or key_value_states is not None

        bsz, q_len, _ = hidden_states.shape

        query_states = ops.transpose(self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim), 1, 2)
        if not is_cross_attention:
            key_states = ops.transpose(self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim), 1, 2)
            value_states = ops.transpose(self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim), 1,
                                         2)
        else:
            _, kv_len, _ = key_value_states.shape  # Note that, in this case, `kv_len` == `kv_seq_len`
            key_states = ops.transpose(self.k_proj(key_value_states).view(bsz, kv_len, self.num_heads, self.head_dim),
                                       1, 2)
            value_states = (
                ops.transpose(self.v_proj(key_value_states).view(bsz, kv_len, self.num_heads, self.head_dim), 1, 2)
            )

        # Total key length includes any cached prefix.
        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value[0].shape[-2]
        if not is_cross_attention:
            # Rotary embeddings only apply to text self-attention.
            cos, sin = self.rotary_emb(value_states, seq_len=max(kv_seq_len, q_len))
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
        # [bsz, nh, t, hd]

        if past_key_value is not None:
            # reuse k, v, self_attention
            key_states = ops.cat([past_key_value[0], key_states], dim=2)
            value_states = ops.cat([past_key_value[1], value_states], dim=2)

        past_key_value = (key_states, value_states) if use_cache else None

        if self.qk_layer_norms:
            query_states = self.q_layer_norm(query_states)
            key_states = self.k_layer_norm(key_states)

        if attention_mask is not None:
            if attention_mask.shape != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}"
                )

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both ops.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
        is_causal = self.is_causal and attention_mask is None and q_len > 1

        attn_output = F.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attention_mask,
            dropout_p=self.dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        if attn_output.shape != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.shape}"
            )

        # Merge heads: (bsz, nh, q_len, hd) -> (bsz, q_len, hidden_size).
        attn_output = ops.transpose(attn_output, 1, 2)
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        attn_weights = None
        if output_attentions:
            logger.warning_once(
                "attn_weights are not extracted in scaled_dot_product_attention. The model returns None instead"
            )

        return attn_output, attn_weights, past_key_value


# this was adapted from LlamaDecoderLayer
class IdeficsDecoderLayer(nn.Module):
    """Standard pre-norm decoder block: RMSNorm -> self-attention -> residual,
    then RMSNorm -> gated MLP -> residual, with dropout on each sub-block output."""

    def __init__(self, config: IdeficsConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = IdeficsAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.dropout,
            config=config,
        )
        self.mlp = IdeficsMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
        self.input_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.dropout = config.dropout

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_value: Optional[Tuple[mindspore.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[mindspore.Tensor, Optional[Tuple[mindspore.Tensor, mindspore.Tensor]]]:
        """
        Args:
            hidden_states (`mindspore.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`mindspore.Tensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`mindspore.Tensor`, *optional*): positions used for rotary embeddings.
            past_key_value (`Tuple(mindspore.Tensor)`, *optional*): cached past key and value projection states.
            output_attentions (`bool`, *optional*): whether to also return attention weights.
            use_cache (`bool`, *optional*): whether to also return the updated key/value cache.

        Returns:
            Tuple of `(hidden_states[, attn_weights][, present_key_value])`.
        """
        # --- self-attention sub-block (pre-norm + residual) ---
        shortcut = hidden_states
        normed = self.input_layernorm(hidden_states)
        attn_out, attn_weights, present_kv = self.self_attn(
            hidden_states=normed,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        attn_out = nn.functional.dropout(attn_out, p=self.dropout, training=self.training)
        hidden_states = shortcut + attn_out

        # --- feed-forward sub-block (pre-norm + residual) ---
        shortcut = hidden_states
        ff_out = self.mlp(self.post_attention_layernorm(hidden_states))
        ff_out = nn.functional.dropout(ff_out, p=self.dropout, training=self.training)
        hidden_states = shortcut + ff_out

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (attn_weights,)
        if use_cache:
            outputs += (present_kv,)
        return outputs


class IdeficsGatedCrossAttentionLayer(nn.Module):
    """Gated cross-attention block conditioning text hidden states on image features.

    The cross-attention and MLP outputs are each scaled by a learned gate
    (`tanh(alpha_*)`), initialized according to `config.alpha_initializer`,
    before being added back to the residual stream.
    """

    def __init__(self, config: IdeficsConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.cross_attn = IdeficsAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            is_cross_attention=True,
            dropout=config.dropout,
            config=config,
            qk_layer_norms=config.qk_layer_norms,
        )
        self.mlp = IdeficsMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
        self.input_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # NOTE(review): despite its name, `self.config` stores the dropout
        # probability (it is used as `p=` in `forward`); a rename would help.
        self.config = config.dropout

        self.act_cross_attn = nn.Tanh()
        self.act_dense = nn.Tanh()

        # Gate parameters: one scalar ("float") or one value per hidden dim
        # ("vector"), initialized to zeros / ones / random normal.
        if config.alpha_initializer == "zeros":
            if config.alpha_type == "vector":
                self.alpha_cross_attn = nn.Parameter(ops.zeros(1, 1, self.hidden_size))
                self.alpha_dense = nn.Parameter(ops.zeros(1, 1, self.hidden_size))
            elif config.alpha_type == "float":
                self.alpha_cross_attn = nn.Parameter(ops.zeros(1))
                self.alpha_dense = nn.Parameter(ops.zeros(1))
            else:
                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")

        elif config.alpha_initializer == "ones":
            if config.alpha_type == "vector":
                self.alpha_cross_attn = nn.Parameter(ops.ones(1, 1, self.hidden_size))
                self.alpha_dense = nn.Parameter(ops.ones(1, 1, self.hidden_size))
            elif config.alpha_type == "float":
                self.alpha_cross_attn = nn.Parameter(ops.ones(1))
                self.alpha_dense = nn.Parameter(ops.ones(1))
            else:
                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")

        elif config.alpha_initializer in {"normal", "gaussian", "random"}:
            if config.alpha_type == "vector":
                self.alpha_cross_attn = nn.Parameter(
                    ops.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, self.hidden_size))
                )
                self.alpha_dense = nn.Parameter(
                    ops.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, self.hidden_size))
                )
            elif config.alpha_type == "float":
                # NOTE(review): `size=(1)` is the int 1, not the tuple `(1,)` —
                # confirm `ops.normal` accepts a scalar size here.
                self.alpha_cross_attn = nn.Parameter(
                    ops.normal(mean=0.0, std=config.alphas_initializer_range, size=(1))
                )
                self.alpha_dense = nn.Parameter(ops.normal(mean=0.0, std=config.alphas_initializer_range, size=(1)))
            else:
                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")

        else:
            raise NotImplementedError(f"Alpha initialization scheme {config.alpha_initializer} not yet implemented!")

        if not (hasattr(self, "alpha_cross_attn") and hasattr(self, "alpha_dense")):
            raise ValueError("Alpha parameters not initialized correctly!")

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        attention_mask: Optional[mindspore.Tensor] = None,
        image_hidden_states: Optional[mindspore.Tensor] = None,
        image_attention_mask: Optional[mindspore.Tensor] = None,
        cross_attention_gate: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        past_key_value: Optional[Tuple[mindspore.Tensor]] = None,
    ) -> Tuple[mindspore.Tensor, Optional[Tuple[mindspore.Tensor, mindspore.Tensor]]]:
        """
        Args:
            hidden_states (`mindspore.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`mindspore.Tensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            image_attention_mask (`mindspore.Tensor`, *optional*): image attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            cross_attention_gate (`mindspore.Tensor`, *optional*):
                gate of size `(batch, seq_len)` used to zero-out cross-attention output for tokens attending no images.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(mindspore.Tensor)`, *optional*): cached past key and value projection states
        """
        if image_hidden_states is None:
            raise ValueError(
                "`image_hidden_states` is required for Idefics cross attention module which are visual features to be"
                " conditioned on."
            )

        if cross_attention_gate is None:
            raise ValueError(
                "`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention hidden_states attending to no images."
            )

        if past_key_value is not None:
            raise NotImplementedError("Past key value states are not implemented for Idefics cross attention module.")

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Cross-attention over the image features (queries come from the text stream).
        hidden_states, self_attn_weights, present_key_value = self.cross_attn(
            hidden_states=hidden_states,
            key_value_states=image_hidden_states,
            attention_mask=image_attention_mask,
            output_attentions=output_attentions,
        )
        # `self.config` holds the dropout probability (see __init__).
        hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training)
        # Fill in zeros for cross_attention hidden_states of tokens attending to no images
        # NOTE(review): relies on `Tensor.fill(0)` returning a zeroed tensor
        # (not the in-place `fill_`) — confirm MindSpore semantics.
        hidden_states[cross_attention_gate == 0] = hidden_states[cross_attention_gate == 0].fill(0)
        # Gated residual add: tanh(alpha) scales the cross-attention contribution.
        hidden_states = residual + self.act_cross_attn(self.alpha_cross_attn) * hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training)
        hidden_states = residual + self.act_dense(self.alpha_dense) * hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


LLAMA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)

This model is also a PyTorch [nn.Module](https://pyops.org/docs/stable/nn.html#nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.

Parameters:
config ([`IdeficsConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


class IdeficsPreTrainedModel(PreTrainedModel):
    """Base class wiring `IdeficsConfig`, weight init, and SDPA opt-in for all Idefics models."""

    config_class = IdeficsConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["IdeficsDecoderLayer", "IdeficsGatedCrossAttentionLayer"]
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize `module`'s weights with N(0, config.initializer_range)."""
        # important: this ported version of Idefics isn't meant for training from scratch - only
        # inference and fine-tuning - so the proper init weights code has been removed - the m4 code
        # base should be used for training from scratch and it contains the correct code.
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=std)
            # Fix: the old truthiness test (`if module.padding_idx:`) treated
            # `padding_idx == 0` as "no padding index" and skipped zeroing its
            # row; compare against None explicitly instead.
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx] = 0

    # Adapted from transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa
    @classmethod
    def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> PretrainedConfig:
        """Force the attention implementation to SDPA unless BetterTransformer is active."""
        # We remove the checks on `is_torch_sdpa_available()` and `cls._supports_sdpa` as Falcon supports SDPA from torch==2.0.0 (no requirement on 2.1).
        _is_bettertransformer = getattr(cls, "use_bettertransformer", False)
        if _is_bettertransformer:
            return config

        if not hard_check_only:
            config._attn_implementation = "sdpa"
        return config


LLAMA_INPUTS_DOCSTRING = r"""
Args:
input_ids (`mindspore.Tensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.

Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.

[What are input IDs?](../glossary#input-ids)
attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.

[What are attention masks?](../glossary#attention-mask)

Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.

If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
`past_key_values`).

If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.

- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
past_key_values (`tuple(tuple(mindspore.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(mindspore.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class IdeficsModel(IdeficsPreTrainedModel):
"""
Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]

Args:
config: IdeficsConfig
"""

    def __init__(self, config: IdeficsConfig):
        """Build embeddings, vision tower, optional perceiver resampler, decoder
        layers with interleaved gated cross-attention, and the final norm."""
        super().__init__(config)
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        # Token embedding with an optional extra, always-trainable vocab slice.
        self.embed_tokens = IdeficsDecoupledEmbedding(
            num_embeddings=config.vocab_size,
            num_additional_embeddings=config.additional_vocab_size,
            embedding_dim=config.hidden_size,
            partially_freeze=config.freeze_text_layers,
            padding_idx=self.padding_idx,
        )

        self.image_size = config.vision_config.image_size
        self.vision_config = config.vision_config
        self.vision_model = IdeficsVisionTransformer(config.vision_config)

        # Perceiver Resampler
        if config.use_resampler:
            perceiver_config = config.perceiver_config
            self.perceiver_resampler = IdeficsPerceiverResampler(
                config,
                config.vision_config.embed_dim,
                perceiver_config.resampler_depth,
                perceiver_config.resampler_n_heads,
                perceiver_config.resampler_head_dim,
                perceiver_config.resampler_n_latents,
            )

        self.layers = nn.ModuleList([IdeficsDecoderLayer(config) for _ in range(config.num_hidden_layers)])

        # One gated cross-attention layer per `cross_layer_interval` decoder layers.
        self.cross_layer_interval = config.cross_layer_interval
        num_cross_layers = config.num_hidden_layers // self.cross_layer_interval
        self.gated_cross_attn_layers = nn.ModuleList(
            [IdeficsGatedCrossAttentionLayer(config) for _ in range(num_cross_layers)]
        )
        self.gradient_checkpointing = False

        self.norm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

        self.freeze_relevant_params(config)

def freeze_relevant_params(self, config=None):
if config is None:
config = self.config

if config.freeze_text_layers:
self.freeze_text_layers(config.freeze_text_module_exceptions)

if config.freeze_vision_layers:
freeze_model(self.vision_model, module_exceptions=config.freeze_vision_module_exceptions)

def freeze_text_layers(self, module_exceptions=[]):
for module in [self.layers, self.norm]:
freeze_model(module, module_exceptions=module_exceptions)

def freeze_vision_layers(self, module_exceptions=[]):
freeze_model(self.vision_model, module_exceptions=module_exceptions)

    def get_input_embeddings(self):
        """Return the (decoupled) token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

def forward(
    self,
    input_ids: mindspore.Tensor = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    past_key_values: Optional[List[mindspore.Tensor]] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    pixel_values: Optional[mindspore.Tensor] = None,
    image_encoder_embeddings: Optional[mindspore.Tensor] = None,
    perceiver_embeddings: Optional[mindspore.Tensor] = None,
    image_attention_mask: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    interpolate_pos_encoding: Optional[bool] = False,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, IdeficsBaseModelOutputWithPast]:
    """Decoder forward pass with interleaved gated cross-attention on image features.

    Exactly one of `pixel_values`, `image_encoder_embeddings` or
    `perceiver_embeddings` must be non-None: image features are respectively
    encoded by the vision tower, taken as pre-encoded embeddings, or assumed
    to be already resampled by the perceiver.

    Returns:
        `IdeficsBaseModelOutputWithPast` (or a tuple when `return_dict=False`)
        with the final hidden state, optional caches / attentions / hidden
        states, and the image hidden states reshaped back to
        `[batch, num_images, image_seq_len, image_hidden_size]`.
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # retrieve input_ids and inputs_embeds
    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
    elif input_ids is not None:
        batch_size, seq_length = input_ids.shape
    elif inputs_embeds is not None:
        batch_size, seq_length, _ = inputs_embeds.shape
    else:
        raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

    seq_length_with_past = seq_length
    past_key_values_length = 0

    if past_key_values is not None:
        # cached keys have shape [bsz, heads, past_seq, head_dim]
        past_key_values_length = past_key_values[0][0].shape[2]
        seq_length_with_past = seq_length_with_past + past_key_values_length

    if attention_mask is not None and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        # BUGFIX: `masked_fill` is not in-place (the in-place variant is
        # `masked_fill_`); the previous code discarded the result, leaving
        # padded positions with position id -1. Assign the returned tensor.
        position_ids = position_ids.masked_fill(attention_mask == 0, 1)
    elif position_ids is None:
        position_ids = ops.arange(
            past_key_values_length, seq_length + past_key_values_length, dtype=mindspore.int64
        )
        position_ids = position_ids.unsqueeze(0)

    if (pixel_values, image_encoder_embeddings, perceiver_embeddings).count(None) != 2:
        raise ValueError(
            "Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None."
        )

    elif pixel_values is not None:
        pixel_values = pixel_values.to(dtype=self.dtype)  # fp16 compatibility
        batch_size, num_images = pixel_values.shape[:2]
        # fold the num_images axis into the batch for the vision encoder
        pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])

        # Get sequence from the vision encoder
        image_hidden_states = self.vision_model(
            pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
        ).last_hidden_state

    elif image_encoder_embeddings is not None:
        batch_size, num_images, image_seq_len, image_hidden_size = image_encoder_embeddings.shape
        image_hidden_states = image_encoder_embeddings.to(dtype=self.dtype)
        image_hidden_states = image_hidden_states.view(batch_size * num_images, image_seq_len, image_hidden_size)

    if self.config.use_resampler:
        if perceiver_embeddings is None:
            # compress the encoder sequence down to the resampler's latents
            perceiver_embeddings = self.perceiver_resampler(image_hidden_states)
            image_seq_len, image_hidden_size = perceiver_embeddings.shape[1], perceiver_embeddings.shape[2]
        else:
            batch_size, num_images, image_seq_len, image_hidden_size = perceiver_embeddings.shape
        image_hidden_states = perceiver_embeddings
    elif perceiver_embeddings is None:
        image_seq_len, image_hidden_size = image_hidden_states.shape[1], image_hidden_states.shape[2]
    else:
        raise ValueError("If `perceiver_embeddings` are passed, use_resampler should be True")

    image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size)
    # # Hack to use the model in full language modeling mode
    # image_attention_mask = ops.zeros(batch_size, seq_length, 1, dtype=mindspore.int64, device=image_hidden_states.device)
    # Make image_attention_mask compatible with hidden states
    text_seq_len = image_attention_mask.shape[1]
    image_attention_mask = image_attention_mask.unsqueeze(-1)
    # image_attention_mask = image_attention_mask.repeat((1, 1, 1, image_seq_len))
    image_attention_mask = ops.tile(image_attention_mask, (1, 1, 1, image_seq_len))
    image_attention_mask = image_attention_mask.view(batch_size, text_seq_len, num_images * image_seq_len)

    if image_hidden_states is not None:
        image_batch_size, image_sequence_length, _ = image_hidden_states.shape
        image_hidden_shape = (image_batch_size, image_sequence_length)
        if image_attention_mask is None:
            image_attention_mask = ops.ones(image_hidden_shape)
        # convert the {0,1} mask to an additive mask (0 / large-negative)
        image_attention_mask = self.invert_attention_mask(image_attention_mask)
    else:
        image_attention_mask = None

    # cross_attention_gate:
    # For any tokens attending to no images, the hidden_states coming out of the cross-attention should be zeroed-out.
    # `image_attention_mask` has shape [bsz, 1, num_images, hidden_size] with elements equal to either 0.0 or a very negative number.
    # If any of the elements are 0.0, then the token is attending to at least one image and the gate value is 1. Otherwise the gate value is 0.
    # `cross_attention_gate` has shape [bsz, seq_len] with elements equal to either 0.0 or 1.0.
    cross_attention_gate = ((((image_attention_mask == 0.0).any(axis=-1)).to(dtype=self.dtype)).squeeze(axis=1))

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)
    # embed positions
    if attention_mask is None:
        attention_mask = ops.ones(
            (batch_size, seq_length_with_past), dtype=mindspore.bool_
        )
    attention_mask = _prepare_4d_causal_attention_mask(
        attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
    )

    hidden_states = inputs_embeds

    if self.gradient_checkpointing and self.training:
        if use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

    # decoder layers
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = () if use_cache else None

    for idx, decoder_layer in enumerate(self.layers):
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        past_key_value = past_key_values[idx] if past_key_values is not None else None

        def vblock(
            main_block,
            hidden_states,
            attention_mask,
            position_ids,
            past_key_value,
            image_hidden_states,
            image_attention_mask,
            cross_attention_gate,
            output_attentions,
            use_cache,
            layer_idx,
            cross_layer_interval,
            gated_cross_attn_layers,
        ):
            # Runs the gated cross-attention layer (every `cross_layer_interval`
            # layers) followed by the regular decoder layer.
            # TODO(ls): Add cross attention values to respective lists
            if layer_idx % cross_layer_interval == 0:
                xblock = gated_cross_attn_layers[layer_idx // cross_layer_interval]
                outputs = xblock(
                    hidden_states,
                    attention_mask=attention_mask,
                    image_hidden_states=image_hidden_states,
                    image_attention_mask=image_attention_mask,
                    cross_attention_gate=cross_attention_gate,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    past_key_value=None,  # not implemented
                )
                hidden_states = outputs[0]

            layer_outputs = main_block(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

            return layer_outputs

        if self.gradient_checkpointing and self.training:
            past_key_value = None
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

            layer_outputs = self._gradient_checkpointing_func(
                vblock,
                decoder_layer,
                hidden_states,
                attention_mask,
                position_ids,
                past_key_value,
                image_hidden_states,
                image_attention_mask,
                cross_attention_gate,
                output_attentions,
                use_cache,
                idx,
                self.cross_layer_interval,
                self.gated_cross_attn_layers,
            )
        else:
            layer_outputs = vblock(
                decoder_layer,
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                image_hidden_states=image_hidden_states,
                image_attention_mask=image_attention_mask,
                cross_attention_gate=cross_attention_gate,
                output_attentions=output_attentions,
                use_cache=use_cache,
                layer_idx=idx,
                cross_layer_interval=self.cross_layer_interval,
                gated_cross_attn_layers=self.gated_cross_attn_layers,
            )

        hidden_states = layer_outputs[0]

        if use_cache:
            next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

        if output_attentions:
            all_self_attns += (layer_outputs[1],)

    hidden_states = self.norm(hidden_states)

    # add hidden states from the last decoder layer
    if output_hidden_states:
        all_hidden_states += (hidden_states,)

    next_cache = next_decoder_cache if use_cache else None
    # restore the per-image layout for downstream consumers / generation cache
    image_hidden_states = image_hidden_states.view(batch_size, num_images, image_seq_len, image_hidden_size)
    if not return_dict:
        return tuple(
            v
            for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, image_hidden_states]
            if v is not None
        )
    return IdeficsBaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=next_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attns,
        image_hidden_states=image_hidden_states,
    )


class IdeficsForVisionText2Text(IdeficsPreTrainedModel):
    """IDEFICS model with a language-modeling head on top, for
    vision-conditioned text generation."""

    # `lm_head.weight` may be absent from checkpoints when embeddings are tied.
    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
    _tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config, vision_model=None):
        # NOTE(review): `vision_model` is accepted but not used here — confirm
        # against the upstream implementation whether it should be forwarded.
        super().__init__(config)
        self.model = IdeficsModel(config)

        # Decoupled head: regular vocab plus `additional_vocab_size` extra rows
        # that can stay trainable while the main head is frozen.
        self.lm_head = IdeficsDecoupledLinear(
            in_features=config.hidden_size,
            out_features=config.vocab_size,
            out_additional_features=config.additional_vocab_size,
            bias=False,
            partially_freeze=config.freeze_lm_head,
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoupled token-embedding module of the inner model."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the inner model's token-embedding module."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the decoupled LM head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the LM head module."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Swap in a different decoder backbone."""
        self.model = decoder

    def get_decoder(self):
        """Return the decoder backbone (`IdeficsModel`)."""
        return self.model

    def tie_weights(self):
        """
        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
        IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
        """
        output_embeddings = self.get_output_embeddings()
        input_embeddings = self.get_input_embeddings()

        if getattr(self.config, "tie_word_embeddings", True):
            # Share both the main embedding matrix and, when present, the
            # additional-vocab matrix with the corresponding head weights.
            output_embeddings.weight = input_embeddings.weight
            if input_embeddings.num_additional_embeddings > 0:
                assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
                output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight

        # Keep the head's reported feature sizes consistent with the embeddings.
        if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
            output_embeddings.out_features = input_embeddings.num_embeddings
        if hasattr(output_embeddings, "out_additional_features") and hasattr(
            input_embeddings, "num_additional_embeddings"
        ):
            output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings

    def forward(
        self,
        input_ids: mindspore.Tensor = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[List[mindspore.Tensor]] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        pixel_values: Optional[mindspore.Tensor] = None,
        image_encoder_embeddings: Optional[mindspore.Tensor] = None,
        perceiver_embeddings: Optional[mindspore.Tensor] = None,
        image_attention_mask: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, IdeficsCausalLMOutputWithPast]:
        r"""
        Args:
            labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:
            `IdeficsCausalLMOutputWithPast` (or a tuple when `return_dict=False`) with
            the optional loss, logits, caches, hidden states, attentions and image
            hidden states from the backbone.
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            pixel_values=pixel_values,
            image_encoder_embeddings=image_encoder_embeddings,
            perceiver_embeddings=perceiver_embeddings,
            image_attention_mask=image_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            if attention_mask is not None:
                # drop padded positions from the loss via the shifted mask
                shift_attention_mask = attention_mask[..., 1:]
                shift_logits = logits[..., :-1, :][shift_attention_mask != 0]
                shift_labels = labels[..., 1:][shift_attention_mask != 0]
            else:
                shift_logits = logits[..., :-1, :]
                shift_labels = labels[..., 1:]
            # Flatten the tokens
            loss = F.cross_entropy(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return IdeficsCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        """Route precomputed image features to the right kwarg for the backbone
        and strip kwargs the model does not accept."""
        image_hidden_states = kwargs.pop("image_hidden_states", None)
        if image_hidden_states is not None:
            # With a resampler the cached features are perceiver outputs,
            # otherwise they are raw encoder embeddings.
            if self.config.use_resampler:
                kwargs["perceiver_embeddings"] = image_hidden_states
            else:
                kwargs["image_encoder_embeddings"] = image_hidden_states
            kwargs["pixel_values"] = None
        # delegates to the module-level `prepare_inputs_for_generation` helper
        inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs)
        unwanted_kwargs = ["token_type_ids"]
        for kwarg in unwanted_kwargs:
            inputs.pop(kwarg, None)
        return inputs

    @staticmethod
    def _expand_inputs_for_generation(
        *args,
        **model_kwargs,
    ):
        """Delegate beam/sample expansion to the module-level helper."""
        return expand_inputs_for_generation(*args, **model_kwargs)

    def _update_model_kwargs_for_generation(
        self,
        outputs: ModelOutput,
        model_kwargs: Dict[str, Any],
        is_encoder_decoder: bool = False,
        standardize_cache_format: bool = False,
    ) -> Dict[str, Any]:
        """Carry image-related state forward between generation steps."""
        model_kwargs = super()._update_model_kwargs_for_generation(
            outputs,
            model_kwargs,
            is_encoder_decoder,
            standardize_cache_format,
        )

        if "image_attention_mask" in model_kwargs:
            # only the mask of the most recent text position is needed next step
            image_attention_mask = model_kwargs["image_attention_mask"]
            last_mask = image_attention_mask[:, -1, :].unsqueeze(1)
            model_kwargs["image_attention_mask"] = last_mask

        # Get the precomputed image_hidden_states
        model_kwargs["image_hidden_states"] = outputs.image_hidden_states
        return model_kwargs

    @staticmethod
    def _reorder_cache(past, beam_idx):
        """Reorder cached key/value states to follow the beam permutation."""
        reordered_past = ()
        for layer_past in past:
            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
        return reordered_past


# Public API of this module, re-exported by the package's __init__.
__all__ = [
    "IdeficsForVisionText2Text",
    "IdeficsModel",
    "IdeficsPreTrainedModel",
]

+ 196
- 0
mindnlp/transformers/models/idefics/perceiver.py View File

@@ -0,0 +1,196 @@
# This code was adapted from https://github.com/lucidrains/flamingo-pytorch licensed under the MIT License.
#
# MIT License
#
# Copyright (c) 2020 The Google AI Language Team Authors, The HuggingFace Inc. team and github/lonePatient
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


"""

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
- DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
- Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch

"""

from typing import Optional, Tuple

import mindspore
from mindnlp.core import nn, ops

from .configuration_idefics import IdeficsConfig


class IdeficsPerceiverResampler(nn.Module):
    """Perceiver Resampler: compresses a variable-length sequence of visual
    embeddings down to a fixed number of learned latent embeddings.

    References:
        - DeepMind's Flamingo
        - lucidrains/flamingo-pytorch (MIT-licensed original)
    """

    def __init__(
        self, config: IdeficsConfig, embed_dim: int, depth: int, n_heads: int, head_dim: int, n_latents: int
    ) -> None:
        """
        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`): The size of each embedding vector
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.head_dim = head_dim
        self.n_latents = n_latents
        self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver

        # Learnable latent queries the resampler cross-attends with.
        self.latents = nn.Parameter(ops.randn(self.n_latents, self.embed_dim), requires_grad=True)

        # MLP width: prefer the vision tower's embedding width (x4) when known.
        if hasattr(config.vision_config, "embed_dim"):
            self.intermediate_dim = config.vision_config.embed_dim * 4
        else:
            self.intermediate_dim = self.embed_dim * 4

        # `depth` stacked (cross-attention, feed-forward) pairs.
        self.blocks = nn.ModuleList(
            [
                nn.ModuleList(
                    [
                        IdeficsPerceiverAttention(self.embed_dim, self.n_heads, self.head_dim, self.qk_layer_norms),
                        IdeficsMLP(self.intermediate_dim, config),
                    ]
                )
                for _ in range(depth)
            ]
        )
        self.layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(self, context: mindspore.Tensor) -> mindspore.Tensor:
        """Compress `context` ([bsz, seq, embed_dim]) down to [bsz, n_latents, embed_dim]."""
        # Broadcast the learned latents across the batch dimension.
        resampled = ops.tile(self.latents, (context.shape[0], 1, 1))
        # Alternate cross-attention and feed-forward, both with residuals.
        for attention, feed_forward in self.blocks:
            resampled = attention(context, resampled) + resampled
            resampled = feed_forward(resampled) + resampled

        return self.layer_norm(resampled)


class IdeficsPerceiverAttention(nn.Module):
    """Perceiver cross-attention: latent queries attend over (context + latents)."""

    def __init__(self, embed_dim: int, n_heads: int, head_dim: int, qk_layer_norms: bool) -> None:
        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
        super().__init__()
        self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
        self.qk_layer_norms = qk_layer_norms
        # Normalization & Scaling
        self.context_layer_norm = nn.LayerNorm(self.embed_dim)
        self.latents_layer_norm = nn.LayerNorm(self.embed_dim)
        if self.qk_layer_norms:
            # optional per-head normalization of queries and keys
            self.q_layer_norm = nn.LayerNorm(self.head_dim)
            self.k_layer_norm = nn.LayerNorm(self.head_dim)

        # standard 1/sqrt(d) attention scaling
        self.qk_scale = self.head_dim ** -0.5

        # Q, K, V Projection (no bias -- detail from Perceiver/Flamingo Papers).
        self.q_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)

        self.output_proj = nn.Linear(self.n_heads * self.head_dim, embed_dim, bias=False)

    def forward(self, context: mindspore.Tensor, latents: mindspore.Tensor) -> mindspore.Tensor:
        """
        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!

        Args:
            context (`mindspore.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`mindspore.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `mindspore.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
            from context.
        """
        context = self.context_layer_norm(context)
        latents = self.latents_layer_norm(latents)
        batch_size, seq_length, embed_dim = context.shape[:3]

        # Query, Key, Value Projections --> Note that in Flamingo, latents are *concatenated* with context prior to attn!
        # Note: This results in queries w/ `seq = n_latents`, and keys, values with `seq = len(context) + n_latents`
        q = self.q_proj(latents)
        k = self.k_proj(ops.cat([context, latents], dim=-2))
        v = self.v_proj(ops.cat([context, latents], dim=-2))

        # Multiheaded Self-Attention w/ stable softmax (subtract per-row max -- `amax` -- before softmax call)
        # =>> `attn` should be a 2D matrix of shape [n_latents x (context + n_latents)]
        # einsum.rearrange(x, "bsz seq (heads embed) -> bsz heads seq embed", heads=self.n_heads)
        q, k, v = [ops.transpose(x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim),1, 2) for x in (q, k, v)]

        if self.qk_layer_norms:
            q = self.q_layer_norm(q)
            k = self.k_layer_norm(k)

        scores = ops.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
        # subtract the (detached) row max for numerical stability before softmax
        stabilized_scores = scores - (scores.amax(dim=-1, keepdim=True).detach())
        attn = stabilized_scores.softmax(dim=-1)

        # Attend & project back to output...
        resampled = ops.einsum("... i j, ... j d -> ... i d", attn, v)
        # einsum.rearrange(resampled, "bsz heads seq embed -> bsz seq (heads embed)", heads=self.n_heads)
        return self.output_proj(ops.transpose(resampled,1, 2).flatten(-2))


class IdeficsMLP(nn.Module):
    """LayerNorm -> up-projection -> ReLU -> down-projection block used by the Perceiver."""

    def __init__(self, intermediate_size, config: IdeficsConfig):
        """Build the MLP with hidden width `intermediate_size` around the vision embed dim."""
        super().__init__()
        self.embed_dim = config.vision_config.embed_dim
        self.ln = nn.LayerNorm(self.embed_dim)
        self.fc = nn.Linear(self.embed_dim, intermediate_size, bias=False)
        self.act = nn.ReLU()
        self.c_proj = nn.Linear(intermediate_size, self.embed_dim, bias=False)

    def forward(self, hidden_states: Optional[Tuple[mindspore.Tensor]]) -> mindspore.Tensor:
        """Apply norm, expansion, activation and projection in one pass."""
        return self.c_proj(self.act(self.fc(self.ln(hidden_states))))


# Public API of this module, re-exported by the package's __init__.
__all__ = [
    "IdeficsPerceiverResampler",
    "IdeficsPerceiverAttention",
    "IdeficsMLP",
]

+ 429
- 0
mindnlp/transformers/models/idefics/processing_idefics.py View File

@@ -0,0 +1,429 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for IDEFICS.
"""

from typing import Callable, List, Optional, Union
from urllib.parse import urlparse

from mindnlp.utils import is_mindspore_available
from mindnlp.core import ops
from mindnlp.core.nn import functional as F

from ...feature_extraction_utils import BatchFeature
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy

if is_mindspore_available():
import mindspore

# Placeholder token marking image positions in interleaved text prompts.
IMAGE_TOKEN = "<image>"


# copied from m4.training.packing
def incremental_to_binary_attention_mask(incremental_mask, return_tensors, num_classes=-1):
    """Turn an incremental image-index mask into a binary one-hot attention mask.

    Args:
        incremental_mask: integer tensor where each element holds the index of
            the image a token attends to, or -1 for "no image".
        return_tensors (`str`): tensor backend; only `"ms"` (MindSpore) is supported.
        num_classes (`int`, *optional*, defaults to -1): number of images; indices
            `>= num_classes` are treated as "no image".

    Returns:
        One-hot tensor with the rows of "no image" positions zeroed out.

    Raises:
        ValueError: if `return_tensors` is not `"ms"`.
    """
    # Fixed: `negatives` used to be bound only inside `if return_tensors == "ms"`
    # while being used unconditionally below, raising NameError for any other
    # backend. Unsupported backends now fail with an explicit ValueError.
    if return_tensors != "ms":
        raise ValueError(f"Unsupported return_tensors: {return_tensors!r}. Only 'ms' is supported.")

    # Set elements >= num_classes to -1
    if num_classes != -1:
        incremental_mask[incremental_mask >= num_classes] = -1

    # Create mask for negative values
    negatives = incremental_mask == -1
    # one_hot cannot take negative indices, so zero them first and blank the
    # corresponding rows afterwards.
    incremental_mask[negatives] = 0

    attn_mask = F.one_hot(incremental_mask, num_classes=num_classes)
    attn_mask[negatives, :] = 0

    return attn_mask


# copied from m4.training.packing
def image_attention_mask_for_packed_input_ids(input_ids, tokenizer, return_tensors):
    """Dispatch image-attention-mask construction to the backend implementation.

    Args:
        input_ids: packed token-id tensor of shape [batch, seq].
        tokenizer: tokenizer providing the `<image>` and EOS token ids.
        return_tensors (`str`): tensor backend; only `"ms"` is supported.

    Returns:
        The `(image_attention_mask, next_image_attention_mask)` pair.

    Raises:
        ValueError: if `return_tensors` is not `"ms"`.
    """
    if return_tensors == "ms":
        return image_attention_mask_for_packed_input_ids_pt(input_ids, tokenizer)
    # Fixed: unsupported backends previously fell through and silently
    # returned None; fail loudly instead.
    raise ValueError(f"Unsupported return_tensors: {return_tensors!r}. Only 'ms' is supported.")


def image_attention_mask_for_packed_input_ids_pt(input_ids, tokenizer):
    """Build per-token image-index masks for packed multi-document sequences.

    For each token position, `image_attention_mask` records the running index of
    the most recent preceding `<image>` token, and `next_image_attention_mask`
    the (re-indexed) position of the next following `<image>` token; -1 means
    "no applicable image". End-of-document tokens reset the association.

    NOTE(review): the `_pt` suffix is inherited from the upstream PyTorch
    implementation; this version operates on MindSpore tensors.
    """
    image_attention_mask = ops.full_like(input_ids, fill_value=-1)
    next_image_attention_mask = ops.full_like(input_ids, fill_value=-1)
    image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
    eod_token_id = tokenizer.eos_token_id
    # Forward scan: each token is tagged with the index of the latest <image>
    # seen so far in its row.
    for batch_idx in range(input_ids.shape[0]):
        count = -1
        seen_eod = False
        for idx, token_id in enumerate(input_ids[batch_idx]):
            if token_id == image_token_id:
                count += 1
                image_attention_mask[batch_idx][idx] = count
                seen_eod = False
            else:
                image_attention_mask[batch_idx][idx] = count

            # Tokens after an end-of-document marker attend to no image.
            if seen_eod:
                image_attention_mask[batch_idx][idx] = -1

            if token_id == eod_token_id:
                seen_eod = True

    # Backward scan: each token is tagged with the index of the next <image>
    # that follows it (counted from the end of the row).
    for batch_idx in range(input_ids.shape[0]):
        count = -1
        seen_eod = False
        for idx in range(input_ids[batch_idx].shape[0] - 1, -1, -1):
            token_id = input_ids[batch_idx][idx]
            if token_id == image_token_id:
                count += 1
                next_image_attention_mask[batch_idx][idx] = count
                seen_eod = False
            else:
                next_image_attention_mask[batch_idx][idx] = count

            if token_id == eod_token_id:
                seen_eod = True

            if seen_eod:
                next_image_attention_mask[batch_idx][idx] = -1

        # Re-index so that indices count forward from each position
        # (offset by the total image count, then negate).
        non_negative_indices = next_image_attention_mask[batch_idx] != -1
        next_image_attention_mask[batch_idx][non_negative_indices] -= count
        next_image_attention_mask[batch_idx][non_negative_indices] *= -1

    return image_attention_mask, next_image_attention_mask


def is_url(string):
    """Checks if the passed string contains a valid url and nothing else. e.g. if space is included it's immediately
    invalidated the url"""
    # A URL embedded in longer text would carry a space — reject outright.
    if " " in string:
        return False
    parsed = urlparse(string)
    # Both a scheme (e.g. "https") and a network location are required.
    return bool(parsed.scheme) and bool(parsed.netloc)


class IdeficsProcessor(ProcessorMixin):
r"""
Constructs a IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor.

[`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See
the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.

Args:
image_processor (`IdeficsImageProcessor`):
An instance of [`IdeficsImageProcessor`]. The image processor is a required input.
tokenizer (`LlamaTokenizerFast`):
An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image)
"""

attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["image_size", "add_end_of_utterance_token"]
image_processor_class = "IdeficsImageProcessor"
tokenizer_class = "LlamaTokenizerFast"

def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
    """Build the IDEFICS processor from an image processor and a tokenizer.

    Args:
        image_processor (`IdeficsImageProcessor`): required image processor.
        tokenizer (`LlamaTokenizerFast`): required tokenizer.
        image_size (`int`, *optional*, defaults to 224): square image size.
        add_end_of_utterance_token (*optional*): accepted for API compatibility;
            not stored here — TODO confirm intended handling.

    Raises:
        ValueError: if `image_processor` or `tokenizer` is missing.
    """
    if image_processor is None:
        raise ValueError("You need to specify an `image_processor`.")
    if tokenizer is None:
        raise ValueError("You need to specify a `tokenizer`.")

    super().__init__(image_processor, tokenizer)
    self.current_processor = self.image_processor
    # id of the "<image>" placeholder token in the tokenizer's vocabulary
    self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)

    # (channels, height, width) used when a prompt has no real image to encode
    self.default_image_dims = (
        self.image_processor.image_num_channels,
        self.image_processor.image_size,
        self.image_processor.image_size,
    )

    # Detect whether the checkpoint's tokenizer knows "<end_of_utterance>";
    # used to decide whether that token may be inserted into prompts.
    self.tokenizer_was_trained_with_end_of_utterance_token = (
        "<end_of_utterance>" in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
    )

def __call__(
self,
prompts: Union[List[TextInput], List[List[TextInput]]],
padding: Union[bool, str, PaddingStrategy] = "longest",
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
transform: Callable = None,
add_eos_token=False,
add_end_of_utterance_token=None,
debug=False,
return_tensors="ms",
) -> BatchEncoding:
"""This method takes batched or non-batched prompts made of text and images and converts them into prompts that
the model was trained on and prepares the image pixel values for the model to process.

Args:
prompts (`Union[List[TextInput], [List[List[TextInput]]]]`):
either a single prompt or a batched list of prompts - see the detailed description immediately after
the end of the arguments doc section.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `"longest"`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
- `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'`: No padding. This will raise an error if the input sequences are of different
lengths.
Note: Unlike most processors, which set padding=`False` by default, `IdeficsProcessor` sets `padding="longest"`
by default. See https://github.com/huggingface/transformers/pull/29449#pullrequestreview-1925576061 for why.
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
truncation (`bool`, *optional*):
Activates truncation to cut input sequences longer than `max_length` to `max_length`.
transform (`Callable`, *optional*):
A custom transform function that accepts a single image can be passed for training. For example,
`torchvision.Compose` can be used to compose multiple functions. If `None` a preset inference-specific
set of transforms will be applied to the images
add_eos_token (`bool`, *optional*, defaults to `False`):
Adds `eos_token` at the end of the final prompt if True`
add_end_of_utterance_token (`bool`, *optional*)
Whether to automatically add `<end_of_utterance>` after each prompt's text input (unless followed by an
image). If `None` the tokenizer will be checked instead and if this token is found in
`additional_special_tokens` then the value will be `True`.
debug (`bool`, *optional*, defaults to `False`):
`True` value will help debug prompt generation by dumping useful information
return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
The type of tensors to return. Can be one of:
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.

Returns:
a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
directly passed to `model.generate`

Detailed explanation:

Each entry in `prompts` is either a text to be passed as is or an image that will be processed.

An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.

When the processor encounters an image it'll inject `<fake_token_around_image><image><fake_token_around_image>`
entry into the prompt.

Example:

```python
checkpoint = "HuggingFaceM4/idefics-9b"
processor = AutoProcessor.from_pretrained(checkpoint)
url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
img = processor.image_processor.fetch_images([url])[0]

prompts = [
"User:",
img,
"Describe this image.\nAssistant: An image of two kittens in grass.\n",
"User:",
"https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
"Describe this image.\nAssistant:",
]

inputs = processor(prompts, return_tensors="ms")
generated_ids = model.generate(**inputs, max_length=100)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```

In this example the `prompts` will be converted into:

```
<s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
Assistant: An image of two kittens in grass.
User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
Assistant:'
```

and the two images will be massaged using [`IdeficsImageProcessor.__call__`] method and placed inside the
`pixel_values` dict entry of the return value.

This example also examplifies that images can be passed as objects or as text urls. It can be seen that the
first image is passed as object and the second one as a url.

To do training do:

```python
image_transform = transforms.Compose(
[
transforms.RandomResizedCrop(
(w, h), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC
),
transforms.ToTensor(),
transforms.Normalize(mean=self.image_mean, std=self.image_std),
]
)
inputs = processor(prompts, transform=image_transform, return_tensors="ms")
```

In order to help debug prompt generation enable `debug=True` which will show you what's happening.

"""

# if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
if add_end_of_utterance_token is None:
add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
# turn non-batched prompts into batched
if not any(isinstance(i, list) for i in prompts):
prompts = [prompts]

fake_token = "<fake_token_around_image>"
image_token = "<image>"
end_of_utterance_token = "<end_of_utterance>"

def image_tokens(last_was_image):
if last_was_image:
return image_token + fake_token
else:
return fake_token + image_token + fake_token

all_prompts = []
all_images = []
for sample in prompts:
# the model was trained on samples starting with <s>
full_text = f"{self.tokenizer.bos_token}"

# an image can either be an image object in the item or the url, everything else is a verbatim prompt text
image_objects = []
last_was_image = False
last_was_text = False
for i, item in enumerate(sample):
if i > 0:
last_was_text = not last_was_image

if isinstance(item, str):
item = item.strip(" ")
if is_url(item):
image = self.image_processor.fetch_images(item)
full_text += image_tokens(last_was_image)
image_objects.append(image)
last_was_image = True
else:
# we add end_of_utterance_token between each subsequent text prompts (but not at the last one!)
if add_end_of_utterance_token and last_was_text:
full_text += end_of_utterance_token
full_text += item
last_was_image = False
else:
# must be an image obj
full_text += image_tokens(last_was_image)
image_objects.append(item)
last_was_image = True

if add_eos_token:
full_text += self.tokenizer.eos_token

if debug is True:
print(f"{full_text=}")

image_objects = self.image_processor(image_objects, transform=transform, return_tensors=return_tensors)

all_prompts.append(full_text)
all_images.append(image_objects)

text_encoding = self.tokenizer(
text=all_prompts,
add_special_tokens=False,
padding=padding,
truncation=truncation,
max_length=max_length,
)
all_texts = text_encoding["input_ids"]
all_attention_masks = text_encoding["attention_mask"]

# max_num_images has to be at least 1 even when there are no images
max_num_images = max(len(x) for x in all_images)
max_num_images = max(1, max_num_images)

at_least_one_image = sum(len(x) for x in all_images) > 0
output_input_ids = []
output_images = []
output_attention_masks = []

for text, attention_mask, images in zip(all_texts, all_attention_masks, all_images):
padded_input_ids = text
image_count = padded_input_ids.count(self.image_token_id)
local_max_num_images = min(image_count, max_num_images)

current_images = images[:local_max_num_images]

if len(current_images) > 0:
if return_tensors == "ms":
padded_image_tensor = ops.zeros(max_num_images, *current_images.shape[1:])
padded_image_tensor[: current_images.shape[0]] = current_images

else:
if return_tensors == "ms":
padded_image_tensor = ops.zeros(max_num_images, *self.default_image_dims)

output_images.append(padded_image_tensor)
if return_tensors == "ms":
output_input_ids.append(mindspore.tensor(padded_input_ids))
output_attention_masks.append(mindspore.tensor(attention_mask))

if return_tensors == "ms":
output_input_ids = ops.stack(output_input_ids)
output_images = ops.stack(output_images)
output_attention_masks = ops.stack(output_attention_masks)

if at_least_one_image:
image_attention_mask, _ = image_attention_mask_for_packed_input_ids(
output_input_ids, self.tokenizer, return_tensors
)
image_attention_mask = incremental_to_binary_attention_mask(
image_attention_mask, return_tensors, num_classes=max_num_images
)
else:
# in full language mode we set the image mask to all-0s
if return_tensors == "ms":
image_attention_mask = ops.zeros(
output_input_ids.shape[0], output_input_ids.shape[1], 1, dtype=mindspore.bool_
)

return BatchFeature(
data={
"input_ids": output_input_ids,
"attention_mask": output_attention_masks,
"pixel_values": output_images,
"image_attention_mask": image_attention_mask,
}
)

def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)

def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)

@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))


# Public API of this module.
__all__ = [
    "IdeficsProcessor"
]

+ 515
- 0
mindnlp/transformers/models/idefics/vision.py View File

@@ -0,0 +1,515 @@
# coding=utf-8
# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MindSpore IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""

import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import mindspore
from mindspore.dataset.vision import Inter, Resize
import numpy as np
from mindnlp.core import nn, ops
from mindnlp.core.nn import functional as F
from mindnlp.utils import ModelOutput, logging

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from .configuration_idefics import IdeficsVisionConfig

logger = logging.get_logger(__name__)


@dataclass
class IdeficsVisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`mindspore.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `mindspore.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `mindspore.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Projected image embedding; only populated by models that add a projection head.
    image_embeds: Optional[mindspore.Tensor] = None
    # Final hidden states of the vision encoder.
    last_hidden_state: mindspore.Tensor = None
    # Per-layer hidden states (embedding output first), when requested.
    hidden_states: Optional[Tuple[mindspore.Tensor, ...]] = None
    # Per-layer attention maps, when requested.
    attentions: Optional[Tuple[mindspore.Tensor, ...]] = None


# Adapted from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings
class IdeficsVisionEmbeddings(nn.Module):
    """CLIP-style patch + position embeddings for the IDEFICS vision tower.

    A learnable class token is prepended to the conv-patch embedding sequence and
    learned absolute position embeddings are added, optionally bicubically
    interpolated for resolutions other than ``config.image_size``.
    """

    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Learnable [CLS]-style token prepended to every patch sequence.
        self.class_embedding = nn.Parameter(ops.randn(self.embed_dim))

        # Non-overlapping patchification: kernel == stride == patch_size, no bias (as in CLIP).
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1  # +1 for the class token
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.position_ids = ops.arange(self.num_positions).broadcast_to((1, -1))

    # Heavily inspired from https://github.com/huggingface/transformers/blob/v4.33.0/src/transformers/models/vit/modeling_vit.py#L82
    def interpolate_pos_encoding(self, embeddings: mindspore.Tensor, height: int, width: int) -> mindspore.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """

        num_patches = embeddings.shape[1] - 1
        pos_embed = self.position_embedding(self.position_ids)
        num_positions = pos_embed.shape[1] - 1
        if num_patches == num_positions and height == width:
            # Same square grid the model was trained on: no interpolation needed.
            return pos_embed
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]

        embed_dim = embeddings.shape[-1]
        num_h_patches = height // self.config.patch_size
        num_w_patches = width // self.config.patch_size
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        num_h_patches, num_w_patches = num_h_patches + 0.1, num_w_patches + 0.1
        sqrt_num_positions = math.sqrt(num_positions)
        patch_pos_embed = patch_pos_embed.reshape(1, int(sqrt_num_positions), int(sqrt_num_positions), embed_dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        fp32_upcasting = patch_pos_embed.dtype == mindspore.bfloat16
        if fp32_upcasting:
            logger.warning_once(
                "Upcasting patch_pos_embed to fp32 for interpolation since `upsample_bicubic2d_out_frame` in F.interpolate "
                "is not implemented for 'mindspore.bfloat16' dtype. This will result in a slight overhead."
            )
            # Bug fix: this used to cast to mindspore.bfloat16 (a no-op), defeating the
            # upcast; the numpy round-trip in the bicubic resize below cannot handle bf16.
            patch_pos_embed = patch_pos_embed.to(mindspore.float32)

        def mindspore_interpolate_bicubic(patch_pos_embed, scale_factor):
            # Bicubic resize via numpy + mindspore.dataset.vision, since a native
            # bicubic F.interpolate is unavailable here.
            patch_pos_embed = patch_pos_embed.asnumpy()

            target_size = (
                int(patch_pos_embed.shape[2] * scale_factor[0]), int(patch_pos_embed.shape[3] * scale_factor[1]))
            # NCHW -> NHWC, the layout Resize expects.
            patch_pos_embed = np.transpose(patch_pos_embed, (0, 2, 3, 1))

            # Resize to the target spatial size with bicubic interpolation.
            resize = Resize(size=target_size, interpolation=Inter.BICUBIC)
            output = resize(patch_pos_embed)

            # Back to NCHW and to a MindSpore tensor.
            output = np.transpose(output, (0, 3, 1, 2))
            output = mindspore.Tensor(output)

            return output

        patch_pos_embed = mindspore_interpolate_bicubic(
            patch_pos_embed,
            scale_factor=(num_h_patches / sqrt_num_positions, num_w_patches / sqrt_num_positions),
        )
        if fp32_upcasting:
            # Restore the original bf16 dtype after interpolating in fp32.
            patch_pos_embed = patch_pos_embed.to(mindspore.bfloat16)
        if int(num_h_patches) != patch_pos_embed.shape[-2] or int(num_w_patches) != patch_pos_embed.shape[-1]:
            raise ValueError(
                f"Number of patches for images ({int(num_h_patches), int(num_w_patches)}) don't match the "
                f"shape of position embedding ({patch_pos_embed.shape[-2], patch_pos_embed.shape[-1]})"
            )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, embed_dim)
        return ops.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(self, pixel_values: mindspore.Tensor, interpolate_pos_encoding: bool = False) -> mindspore.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding:
            # Without interpolation the input must match the pre-training resolution exactly.
            if height != self.image_size or width != self.image_size:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size}*{self.image_size}). You should try to set `interpolate_pos_encoding=True`"
                )

        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]

        # Flatten the spatial grid and move it to the sequence dimension: (B, N, C).
        patch_embeds = ops.transpose(patch_embeds.flatten(start_dim=2), 1, 2)

        class_embeds = self.class_embedding.broadcast_to((batch_size, 1, -1))
        embeddings = ops.cat([class_embeds, patch_embeds], dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings


# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->IdeficsVision
class IdeficsVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):

        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        # 1/sqrt(head_dim) scaling, applied to the query projection below.
        self.scale = self.head_dim ** -0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: mindspore.Tensor, seq_len: int, bsz: int):
        # (B, S, E) -> (B, num_heads, S, head_dim)
        return ops.transpose(tensor.view(bsz, seq_len, self.num_heads, self.head_dim), 1, 2)

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        attention_mask: Optional[mindspore.Tensor] = None,
        causal_attention_mask: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.shape

        # get query proj (pre-scaled so the bmm below needs no extra division)
        query_states = self.q_proj(hidden_states) * self.scale
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        # Fold heads into the batch dim so attention is a single batched matmul.
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.shape[1]
        attn_weights = ops.bmm(query_states, ops.transpose(key_states, 1, 2))

        if attn_weights.shape != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.shape}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.shape != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.shape}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.shape != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.shape}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = F.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        # Dropout on the attention probabilities (active only in training mode).
        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = ops.bmm(attn_probs, value_states)

        if attn_output.shape != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.shape}"
            )

        # Un-fold the heads and merge them back into the embedding dimension.
        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = ops.transpose(attn_output, 1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->IdeficsVision
class IdeficsVisionMLP(nn.Module):
    """Position-wise feed-forward block: hidden -> intermediate -> hidden with the configured activation."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor:
        # Expand, apply the non-linearity, then project back to the embedding size.
        return self.fc2(self.activation_fn(self.fc1(hidden_states)))


# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoderLayer with AltCLIP->IdeficsVision
class IdeficsVisionEncoderLayer(nn.Module):
    """One pre-norm transformer block: self-attention then MLP, each wrapped in a residual connection."""

    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = IdeficsVisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = IdeficsVisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        attention_mask: mindspore.Tensor,
        causal_attention_mask: mindspore.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[mindspore.Tensor]:
        """
        Args:
            hidden_states (`mindspore.Tensor`): input of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`mindspore.Tensor`): additive mask of size `(batch, 1, tgt_len, src_len)`
                where padding positions hold very large negative values.
            causal_attention_mask (`mindspore.Tensor`): additive causal mask, same shape as above.
            output_attentions (`bool`, *optional*): when True, the attention weights are
                appended to the returned tuple.
        """
        # --- self-attention sub-block (pre-norm + residual) ---
        shortcut = hidden_states
        normed = self.layer_norm1(hidden_states)
        attn_out, attn_weights = self.self_attn(
            hidden_states=normed,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = shortcut + attn_out

        # --- feed-forward sub-block (pre-norm + residual) ---
        shortcut = hidden_states
        hidden_states = shortcut + self.mlp(self.layer_norm2(hidden_states))

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)


# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->IdeficsVision
class IdeficsVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`IdeficsVisionEncoderLayer`].

    Args:
        config: IdeficsVisionConfig
    """

    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([IdeficsVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[mindspore.Tensor] = None,
        causal_attention_mask: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Run the layer stack over `inputs_embeds`.

        Args:
            inputs_embeds (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded input sequence (computed upstream from pixel values).
            attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask with 1 for tokens to attend to and 0 for masked tokens.
            causal_attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask, same convention as `attention_mask`.
            output_attentions (`bool`, *optional*):
                Whether to also return each layer's attention weights.
            output_hidden_states (`bool`, *optional*):
                Whether to also return every intermediate hidden state.
            return_dict (`bool`, *optional*):
                Whether to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """

        # Per-call arguments override the config defaults.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        collected_states = () if output_hidden_states else None
        collected_attentions = () if output_attentions else None

        hidden_states = inputs_embeds

        for layer_module in self.layers:
            if output_hidden_states:
                # Record the state *entering* this layer (so the embedding output comes first).
                collected_states = collected_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                collected_attentions = collected_attentions + (layer_outputs[1],)

        if output_hidden_states:
            collected_states = collected_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in (hidden_states, collected_states, collected_attentions) if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=collected_states, attentions=collected_attentions
        )


# Adapted from transformers.models.clip.modeling_clip.CLIPVisionTransformer
class IdeficsVisionTransformer(nn.Module):
    """CLIP-style vision transformer: patch embedding, pre-LayerNorm, encoder stack, and a pooled CLS output."""

    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = IdeficsVisionEmbeddings(config)
        # NOTE: "pre_layrnorm" (sic) matches the pretrained checkpoint's parameter name — do not rename.
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = IdeficsVisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    # Adapted from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward
    def forward(
        self,
        pixel_values: Optional[mindspore.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Encode `pixel_values` and return the last hidden state plus a pooled CLS representation.
        """
        # Per-call arguments override the config defaults.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedded = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        embedded = self.pre_layrnorm(embedded)

        encoder_outputs = self.encoder(
            inputs_embeds=embedded,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        # Pool by taking the CLS token (position 0) and normalizing it.
        pooled_output = self.post_layernorm(last_hidden_state[:, 0, :])

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


# Public API of this module.
__all__ = [
    "IdeficsVisionTransformer"
]

+ 17
- 26
mindnlp/transformers/models/layoutlmv3/image_processing_layoutlmv3.py View File

@@ -13,12 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for LayoutLMv3."""

from typing import Dict, Iterable, Optional, Union
import pytesseract
import numpy as np
from mindnlp.utils import TensorType, is_vision_available, logging
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import resize, to_channel_dimension_format, to_pil_image
from ....configs import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
from ...image_utils import (
ChannelDimension,
ImageInput,
@@ -28,16 +30,24 @@ from ...image_utils import (
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
from ....utils import (
TensorType,
is_pytesseract_available,
is_vision_available,
logging,
requires_backends,
)


if is_vision_available():
import PIL

# soft dependency
IMAGENET_STANDARD_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STANDARD_STD = [0.229, 0.224, 0.225]
if is_pytesseract_available():
import pytesseract

logger = logging.get_logger(__name__)


@@ -159,23 +169,6 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor):
self.apply_ocr = apply_ocr
self.ocr_lang = ocr_lang
self.tesseract_config = tesseract_config
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"apply_ocr",
"ocr_lang",
"tesseract_config",
"return_tensors",
"data_format",
"input_data_format",
]

# Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
def resize(
@@ -243,7 +236,6 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> PIL.Image.Image:
"""
Preprocess an image or batch of images.
@@ -310,8 +302,6 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor):
tesseract_config = tesseract_config if tesseract_config is not None else self.tesseract_config
images = make_list_of_images(images)

validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)

if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
@@ -343,7 +333,7 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor):

# Tesseract OCR to get words + normalized bounding boxes
if apply_ocr:
#requires_backends(self, "pytesseract")
requires_backends(self, "pytesseract")
words_batch = []
boxes_batch = []
for image in images:
@@ -379,4 +369,5 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor):
data["words"] = words_batch
data["boxes"] = boxes_batch
return data

__all__ = ["LayoutLMv3ImageProcessor"]

+ 66
- 86
mindnlp/transformers/models/llama/configuration_llama.py View File

@@ -17,15 +17,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" LLaMA model configuration"""
"""LLaMA model configuration"""

from mindnlp.utils import logging
from ...configuration_utils import PretrainedConfig


logger = logging.get_logger(__name__)

LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
from ...modeling_rope_utils import rope_config_validation


class LlamaConfig(PretrainedConfig):
@@ -54,7 +49,7 @@ class LlamaConfig(PretrainedConfig):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be forwarded
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details checkout [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
@@ -78,40 +73,70 @@ class LlamaConfig(PretrainedConfig):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://hf-mirror.com/docs/transformers/parallelism) to understand more about it. This value is
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
issue](https://github.com/pytorch/pytorch/issues/76232).
document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
experimental feature, subject to breaking API changes in future versions.
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
`factor` (`float`, *optional*):
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
original maximum pre-trained length.
`original_max_position_embeddings` (`int`, *optional*):
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
pretraining.
`attention_factor` (`float`, *optional*):
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
computation. If unspecified, it defaults to value recommended by the implementation, using the
`factor` field to infer the suggested value.
`beta_fast` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
ramp function. If unspecified, it defaults to 32.
`beta_slow` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
ramp function. If unspecified, it defaults to 1.
`short_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`long_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to long contexts (>
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`low_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
`high_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.

```python
>>> from transformers import LlamaModel, LlamaConfig

>>> # Initializing a LLaMA llama-7b style configuration
>>> configuration = LlamaConfig()

>>> # Initializing a model from the llama-7b style configuration
>>> model = LlamaModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""

Example:
```python
>>> from transformers import LlamaModel, LlamaConfig
...
>>> # Initializing a LLaMA llama-7b style configuration
>>> configuration = LlamaConfig()
...
>>> # Initializing a model from the llama-7b style configuration
>>> model = LlamaModel(configuration)
...
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "llama"
keys_to_ignore_at_inference = ["past_key_values"]

@@ -137,39 +162,9 @@ class LlamaConfig(PretrainedConfig):
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
mlp_bias=False,
**kwargs,
):
"""
This method initializes an instance of the LlamaConfig class.
Args:
vocab_size (int, optional): The size of the vocabulary. Default is 32000.
hidden_size (int, optional): The size of the hidden layers. Default is 4096.
intermediate_size (int, optional): The size of the intermediate layers. Default is 11008.
num_hidden_layers (int, optional): The number of hidden layers. Default is 32.
num_attention_heads (int, optional): The number of attention heads. Default is 32.
num_key_value_heads (int, optional): The number of key and value heads. If not provided, it defaults to num_attention_heads.
hidden_act (str, optional): The activation function for the hidden layers. Default is 'silu'.
max_position_embeddings (int, optional): The maximum position embeddings. Default is 2048.
initializer_range (float, optional): The range for weight initialization. Default is 0.02.
rms_norm_eps (float, optional): The epsilon value for RMS normalization. Default is 1e-06.
pretraining_tp (int, optional): The pretraining TP value. Default is 1.
use_cache (bool, optional): Indicates whether to use cache. Default is True.
pad_token_id (int, optional): The ID of the padding token.
bos_token_id (int, optional): The ID of the beginning of sequence token. Default is 1.
eos_token_id (int, optional): The ID of the end of sequence token. Default is 2.
tie_word_embeddings (bool, optional): Indicates whether to tie word embeddings. Default is False.
rope_theta (float, optional): The theta value for ROPE. Default is 10000.0.
rope_scaling (None or float, optional): The scaling value for ROPE. If provided, it should be validated.
attention_bias (bool, optional): Indicates whether to use attention bias. Default is False.
attention_dropout (float, optional): The dropout rate for attention. Default is 0.0.
Returns:
None.
Raises:
ValueError: If rope_scaling is provided and it does not pass the validation.
"""
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
@@ -189,9 +184,15 @@ class LlamaConfig(PretrainedConfig):
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self._rope_scaling_validation()
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias

# Validate the correctness of rotary position embeddings parameters
# BC: if there is a 'type' field, move it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)

super().__init__(
pad_token_id=pad_token_id,
@@ -201,25 +202,4 @@ class LlamaConfig(PretrainedConfig):
**kwargs,
)

def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return

if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
f"got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")

__all__ = ['LlamaConfig']

+ 627
- 1084
mindnlp/transformers/models/llama/modeling_llama.py
File diff suppressed because it is too large
View File


+ 49
- 219
mindnlp/transformers/models/llama/tokenization_llama.py View File

@@ -17,33 +17,27 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenization classes for LLaMA."""

import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from mindnlp.utils import logging
from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...tokenization_utils_base import TextInput
from ....utils import logging


if TYPE_CHECKING:
from ...tokenization_utils_base import TextInput

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"hf-internal-testing/llama-tokenizer": "https://hf-mirror.com/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
},
"tokenizer_file": {
"hf-internal-testing/llama-tokenizer": "https://hf-mirror.com/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"hf-internal-testing/llama-tokenizer": 2048,
}
SPIECE_UNDERLINE = "▁"

B_INST, E_INST = "[INST]", "[/INST]"
@@ -85,13 +79,13 @@ class LlamaTokenizer(PreTrainedTokenizer):
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.

- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
BPE-dropout.

add_bos_token (`bool`, *optional*, defaults to `True`):
Whether or not to add an `bos_token` at the start of sequences.
@@ -107,30 +101,32 @@ class LlamaTokenizer(PreTrainedTokenizer):
legacy (`bool`, *optional*):
Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
and #25224 which includes fixes to properly handle tokens that appear after special tokens.
Make sure to also set `from_slow` to `True`.
A simple example:
- `legacy=True`:
```python
>>> from transformers import T5Tokenizer
...
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
>>> tokenizer.encode("Hello <extra_id_0>.")
[8774, 32099, 3, 5, 1]
```
- `legacy=False`:
```python
>>> from transformers import T5Tokenizer
...
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
>>> tokenizer.encode("Hello <extra_id_0>.") # the extra space `[3]` is no longer here
[8774, 32099, 5, 1]
```
Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.

- `legacy=True`:
```python
>>> from transformers import LlamaTokenizerFast

>>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=True, from_slow=True)
>>> tokenizer.encode("Hello <s>.") # 869 is '▁.'
[1, 15043, 29871, 1, 869]
```
- `legacy=False`:
```python
>>> from transformers import LlamaTokenizerFast

>>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
>>> tokenizer.encode("Hello <s>.") # 29889 is '.'
[1, 15043, 29871, 1, 29889]
```
Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
add_prefix_space (`bool`, *optional*, defaults to `True`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. Again, this should be set with `from_slow=True` to make sure it's taken into account.
"""

vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]

def __init__(
@@ -147,38 +143,9 @@ class LlamaTokenizer(PreTrainedTokenizer):
use_default_system_prompt=False,
spaces_between_special_tokens=False,
legacy=None,
add_prefix_space=True,
**kwargs,
):
"""
Initializes a new instance of the LlamaTokenizer class.

Args:
self: The instance of the class.
vocab_file (str): The path to the vocabulary file.
unk_token (str, optional): The unknown token. Defaults to '<unk>'.
bos_token (str, optional): The beginning of sentence token. Defaults to '<s>'.
eos_token (str, optional): The end of sentence token. Defaults to '</s>'.
pad_token (str, optional): The padding token. Defaults to None.
sp_model_kwargs (Dict[str, Any], optional): Additional arguments for the sentencepiece model. Defaults to None.
add_bos_token (bool, optional): Whether to add the beginning of sentence token. Defaults to True.
add_eos_token (bool, optional): Whether to add the end of sentence token. Defaults to False.
clean_up_tokenization_spaces (bool, optional): Whether to clean up tokenization spaces. Defaults to False.
use_default_system_prompt (bool, optional): Whether to use the default system prompt. Defaults to False.
spaces_between_special_tokens (bool, optional): Whether to add spaces between special tokens. Defaults to False.
legacy (bool, optional): Whether to use the legacy behavior. Defaults to None.

Returns:
None.

Raises:
None.

Note:
You are using the default legacy behavior of the LlamaTokenizer. This means that the previous behavior
will be used, and nothing changes. If you want to use the new behavior, set `legacy=False`.
Only set this if you understand the implications and have thoroughly read the reason for this change
as explained in https://github.com/huggingface/transformers/pull/24565.
"""
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
@@ -191,7 +158,8 @@ class LlamaTokenizer(PreTrainedTokenizer):
" expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
" If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
" means, and thoroughly read the reason why this was added as explained in"
" https://github.com/huggingface/transformers/pull/24565"
" https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file"
" you can ignore this message"
)
legacy = True

@@ -201,6 +169,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
self.add_eos_token = add_eos_token
self.use_default_system_prompt = use_default_system_prompt
self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
self.add_prefix_space = add_prefix_space

super().__init__(
bos_token=bos_token,
@@ -214,52 +183,16 @@ class LlamaTokenizer(PreTrainedTokenizer):
use_default_system_prompt=use_default_system_prompt,
spaces_between_special_tokens=spaces_between_special_tokens,
legacy=legacy,
add_prefix_space=add_prefix_space,
**kwargs,
)

@property
def unk_token_length(self):
"""
Returns the length of the unknown token in the LlamaTokenizer.

Args:
self: An instance of the LlamaTokenizer class.

Returns:
int: The method returns the length of the unknown token as an integer value.

Raises:
None.

This method calculates and returns the length of the unknown token in the LlamaTokenizer.
The unknown token is represented as a string and is encoded using the sp_model.encode() method.
The length of the encoded unknown token is then determined using the len() function and returned as
an integer value. The method does not modify any internal state or variables of the LlamaTokenizer class.

Example:
```python
>>> tokenizer = LlamaTokenizer()
>>> unk_token_length = tokenizer.unk_token_length()
>>> print(unk_token_length) # Output: 5
```
"""
return len(self.sp_model.encode(str(self.unk_token)))

# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
def get_spm_processor(self, from_slow=False):
"""
Retrieves the SentencePieceProcessor instance for the LlamaTokenizer.

Args:
self (LlamaTokenizer): The instance of LlamaTokenizer.
from_slow (bool): A flag indicating whether to load the tokenizer from a slow source. Defaults to False.

Returns:
None.

Raises:
None.
"""
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
if self.legacy or from_slow: # no dependency on protobuf
tokenizer.Load(self.vocab_file)
@@ -277,45 +210,12 @@ class LlamaTokenizer(PreTrainedTokenizer):
return tokenizer

def __getstate__(self):
"""
Method to serialize the state of the LlamaTokenizer instance for pickling.

Args:
self (LlamaTokenizer): The instance of the LlamaTokenizer class.
Represents the current instance of the tokenizer.

Returns:
None: This method does not explicitly return a value, but it updates the state of the tokenizer object.
The state dictionary contains a copy of the instance's attributes with modifications as needed for
serialization.

Raises:
None
"""
state = self.__dict__.copy()
state["sp_model"] = None
state["sp_model_proto"] = self.sp_model.serialized_model_proto()
return state

def __setstate__(self, d):
"""
This method '__setstate__' in the class 'LlamaTokenizer' is responsible for restoring the state of the object
from a dictionary representation.

Args:
self (object): The instance of the class.
d (dict): A dictionary containing the state information to be restored.
It should include the necessary data to reforward the object's state.

Returns:
None: The method does not explicitly return any value,
as it operates by directly updating the object's state.

Raises:
TypeError: If the provided 'd' parameter is not a dictionary.
AttributeError: If the necessary attributes are not present in the dictionary 'd'.
ValueError: If there are issues with loading or reforwarding the 'sp_model' using the provided data.
"""
self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
@@ -332,7 +232,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
return vocab

# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
"""
Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
first token is special.
@@ -340,7 +240,11 @@ class LlamaTokenizer(PreTrainedTokenizer):
if self.legacy or len(text) == 0:
return super().tokenize(text, **kwargs)

tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)
text = text.replace(SPIECE_UNDERLINE, " ")
if self.add_prefix_space:
text = SPIECE_UNDERLINE + text

tokens = super().tokenize(text, **kwargs)

if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
tokens = tokens[1:]
@@ -357,9 +261,8 @@ class LlamaTokenizer(PreTrainedTokenizer):
`unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
`self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
"""
tokens = self.sp_model.encode(text, out_type=str)
if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
return tokens
return self.sp_model.encode(text, out_type=str)

# 1. Encode string + prefix ex: "<unk> Hey"
tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
@@ -378,7 +281,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
# since we manually add the prefix space, we have to remove it when decoding
if tokens[0].startswith(SPIECE_UNDERLINE):
if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
tokens[0] = tokens[0][1:]

current_sub_tokens = []
@@ -393,6 +296,8 @@ class LlamaTokenizer(PreTrainedTokenizer):
prev_is_special = True
current_sub_tokens = []
else:
if prev_is_special and i == 1 and self.add_prefix_space and not token.startswith(SPIECE_UNDERLINE):
out_string += " "
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
@@ -426,21 +331,6 @@ class LlamaTokenizer(PreTrainedTokenizer):
return (out_vocab_file,)

def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
'''
This method builds inputs with special tokens for a LlamaTokenizer.

Args:
self: The instance of the LlamaTokenizer class.
token_ids_0: A list of token IDs representing the first sequence.
token_ids_1 (optional): A list of token IDs representing the second sequence.
Defaults to None if not provided.

Returns:
A list of token IDs with special tokens added at the beginning and end of the sequences.

Raises:
None
'''
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []

@@ -521,64 +411,4 @@ class LlamaTokenizer(PreTrainedTokenizer):

return output

@property
def default_chat_template(self):
"""
LLaMA uses [INST] and [/INST] to indicate user messages, and <<SYS>> and <</SYS>> to indicate system messages.
Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict
user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering
rather than needing special tokens. The system message is partly 'embedded' in the first user message, which
results in an unusual token ordering when it is present. This template should definitely be changed if you wish
to fine-tune a model with more flexible role ordering!

The output should look something like:

<bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos><bos>[INST] Prompt [/INST] Answer <eos>
<bos>[INST] Prompt [/INST]

The reference for this chat template is [this code
snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
in the original repository.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://hf-mirror.com/docs/transformers/main/chat_templating for more information.\n"
)
template = (
"{% if messages[0]['role'] == 'system' %}"
"{% set loop_messages = messages[1:] %}" # Extract system message if it's present
"{% set system_message = messages[0]['content'] %}"
"{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}"
"{% set loop_messages = messages %}" # Or use the default system message if the flag is set
"{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
"{% else %}"
"{% set loop_messages = messages %}"
"{% set system_message = false %}"
"{% endif %}"
"{% for message in loop_messages %}" # Loop over all non-system messages
"{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
"{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
"{% endif %}"
"{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message
"{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}"
"{% else %}"
"{% set content = message['content'] %}"
"{% endif %}"
"{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way
"{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}"
"{% elif message['role'] == 'system' %}"
"{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}"
"{% elif message['role'] == 'assistant' %}"
"{{ ' ' + content.strip() + ' ' + eos_token }}"
"{% endif %}"
"{% endfor %}"
)
template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false")
default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'")
template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)

return template

__all__ = ['LlamaTokenizer']

+ 57
- 204
mindnlp/transformers/models/llama/tokenization_llama_fast.py View File

@@ -12,15 +12,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Tokenization classes for LLaMA."""
"""tokenization llama fast"""
import os
from shutil import copyfile
from typing import Optional, Tuple

from tokenizers import processors

from mindnlp.utils import is_sentencepiece_available, logging
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ....utils import is_sentencepiece_available, logging


if is_sentencepiece_available():
from .tokenization_llama import LlamaTokenizer
@@ -30,14 +31,6 @@ else:
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}

PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"hf-internal-testing/llama-tokenizer": "https://hf-mirror.com/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
},
"tokenizer_file": {
"hf-internal-testing/llama-tokenizer": "https://hf-mirror.com/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
},
}
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

@@ -57,19 +50,18 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):

This uses notably ByteFallback and no normalization.

Example:
```python
>>> from transformers import LlamaTokenizerFast
...
>>> tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
>>> tokenizer.encode("Hello this is a test")
[1, 15043, 445, 338, 263, 1243]
```
```python
>>> from transformers import LlamaTokenizerFast

>>> tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
>>> tokenizer.encode("Hello this is a test")
[1, 15043, 445, 338, 263, 1243]
```

If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
values of the first token and final token of an encoded sequence will not be correct). For more details, checkout
[post-processors] (https://hf-mirror.com/docs/tokenizers/api/post-processors) documentation.
[post-processors](https://huggingface.co/docs/tokenizers/api/post-processors) documentation.


This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
@@ -97,10 +89,35 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
add_eos_token (`bool`, *optional*, defaults to `False`):
Whether or not to add an `eos_token` at the end of sequences.
use_default_system_prompt (`bool`, *optional*, defaults to `False`):
Whether or not the default system prompt for Llama should be used.
Whether or not the default system prompt for Llama should be used
legacy (`bool`, *optional*):
Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
and #25224 which includes fixes to properly handle tokens that appear after special tokens.
Make sure to also set `from_slow` to `True`.
A simple example:

- `legacy=True`:
```python
>>> from transformers import LlamaTokenizerFast

>>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=True, from_slow=True)
>>> tokenizer.encode("Hello <s>.") # 869 is '▁.'
[1, 15043, 29871, 1, 869]
```
- `legacy=False`:
```python
>>> from transformers import LlamaTokenizerFast

>>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
>>> tokenizer.encode("Hello <s>.") # 29889 is '.'
[1, 15043, 29871, 1, 29889]
```
Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
add_prefix_space (`bool`, *optional*):
Whether or not the tokenizer should automatically add a prefix space
"""

vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
slow_tokenizer_class = LlamaTokenizer
padding_side = "left"
model_input_names = ["input_ids", "attention_mask"]
@@ -116,30 +133,25 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
add_bos_token=True,
add_eos_token=False,
use_default_system_prompt=False,
legacy=None,
add_prefix_space=None,
**kwargs,
):
"""
Initializes a new instance of the LlamaTokenizerFast class.
Args:
self: The current instance of the class.
vocab_file (str): The path to the vocabulary file. Default is None.
tokenizer_file (str): The path to the tokenizer file. Default is None.
clean_up_tokenization_spaces (bool): Whether to clean up tokenization spaces. Default is False.
unk_token (str): The unknown token. Default is '<unk>'.
bos_token (str): The beginning of sentence token. Default is '<s>'.
eos_token (str): The end of sentence token. Default is '</s>'.
add_bos_token (bool): Whether to add the beginning of sentence token. Default is True.
add_eos_token (bool): Whether to add the end of sentence token. Default is False.
use_default_system_prompt (bool): Whether to use the default system prompt. Default is False.
**kwargs: Additional keyword arguments.
Returns:
None.
Raises:
None.
"""
if legacy is None:
logger.warning_once(
f"You are using the default legacy behaviour of the {self.__class__}. This is"
" expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
" If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
" means, and thoroughly read the reason why this was added as explained in"
" https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file"
" you can ignore this message."
)
legacy = True
self.legacy = legacy

if add_prefix_space is not None:
kwargs["from_slow"] = True

super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
@@ -150,6 +162,8 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
use_default_system_prompt=use_default_system_prompt,
add_prefix_space=add_prefix_space,
legacy=legacy,
**kwargs,
)
self._add_bos_token = add_bos_token
@@ -160,18 +174,6 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):

@property
def can_save_slow_tokenizer(self) -> bool:
    """bool: Whether the SentencePiece vocab file backing the slow tokenizer is available on disk.

    Saving a slow tokenizer from this fast one requires the original
    `vocab_file`; if the attribute is unset or the file is gone, saving is
    impossible and this returns False.
    """
    if not self.vocab_file:
        return False
    return os.path.isfile(self.vocab_file)

def update_post_processor(self):
@@ -202,97 +204,23 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):

@property
def add_eos_token(self):
    """
    bool: Whether an `eos_token` is appended to encoded sequences.

    This is a read accessor for the private `_add_eos_token` flag; use the
    matching setter to change it (the setter also rebuilds the post-processor).
    """
    return self._add_eos_token

@property
def add_bos_token(self):
    """
    bool: Whether a `bos_token` is prepended to encoded sequences.

    This is a read accessor for the private `_add_bos_token` flag; use the
    matching setter to change it (the setter also rebuilds the post-processor).
    """
    return self._add_bos_token

@add_eos_token.setter
def add_eos_token(self, value):
    """
    Set whether an `eos_token` is appended to encoded sequences.

    Updating the flag alone is not enough for a fast tokenizer: the change
    must be propagated to the backend, so the template post-processor is
    rebuilt immediately.

    Args:
        value (bool): New value for the `add_eos_token` behavior.
    """
    self._add_eos_token = value
    self.update_post_processor()

@add_bos_token.setter
def add_bos_token(self, value):
    """
    Set whether a `bos_token` is prepended to encoded sequences.

    Updating the flag alone is not enough for a fast tokenizer: the change
    must be propagated to the backend, so the template post-processor is
    rebuilt immediately.

    Args:
        value (bool): New value for the `add_bos_token` behavior.
    """
    self._add_bos_token = value
    self.update_post_processor()

def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Saves the vocabulary for a slow tokenizer.
Args:
self (LlamaTokenizerFast): An instance of the LlamaTokenizerFast class.
save_directory (str): The directory where the vocabulary will be saved.
filename_prefix (Optional[str]): An optional prefix to be added to the filename.
Defaults to None if not provided.
Returns:
Tuple[str]: A tuple containing the path to the saved vocabulary file.
Raises:
ValueError: If the fast tokenizer does not have the necessary information to save the vocabulary
for a slow tokenizer.
FileNotFoundError: If the save_directory does not exist.
Note:
The fast tokenizer must have the necessary information to save the vocabulary for a slow tokenizer.
The save_directory must be a valid directory.
The function will raise a ValueError if the fast tokenizer does not meet the requirements.
The function will raise a FileNotFoundError if the save_directory does not exist.
"""
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
@@ -311,84 +239,9 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):

return (out_vocab_file,)

@property
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template
def default_chat_template(self):
    """
    LLaMA uses [INST] and [/INST] to indicate user messages, and <<SYS>> and <</SYS>> to indicate system messages.
    Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict
    user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering
    rather than needing special tokens. The system message is partly 'embedded' in the first user message, which
    results in an unusual token ordering when it is present. This template should definitely be changed if you wish
    to fine-tune a model with more flexible role ordering!

    The output should look something like:

    <bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos><bos>[INST] Prompt [/INST] Answer <eos>
    <bos>[INST] Prompt [/INST]

    The reference for this chat template is [this code
    snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
    in the original repository.
    """
    # Warn (once per process) that the class-level fallback template is in use.
    logger.warning_once(
        "\nNo chat template is defined for this tokenizer - using the default template "
        f"for the {self.__class__.__name__} class. If the default is not appropriate for "
        "your model, please set `tokenizer.chat_template` to an appropriate template. "
        "See https://hf-mirror.com/docs/transformers/main/chat_templating for more information.\n"
    )
    # Jinja template built as adjacent string literals; USE_DEFAULT_PROMPT and
    # DEFAULT_SYSTEM_MESSAGE are placeholders substituted below.
    template = (
        "{% if messages[0]['role'] == 'system' %}"
        "{% set loop_messages = messages[1:] %}"  # Extract system message if it's present
        "{% set system_message = messages[0]['content'] %}"
        "{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}"
        "{% set loop_messages = messages %}"  # Or use the default system message if the flag is set
        "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
        "{% else %}"
        "{% set loop_messages = messages %}"
        "{% set system_message = false %}"
        "{% endif %}"
        "{% for message in loop_messages %}"  # Loop over all non-system messages
        "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
        "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
        "{% endif %}"
        "{% if loop.index0 == 0 and system_message != false %}"  # Embed system message in first message
        "{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}"
        "{% else %}"
        "{% set content = message['content'] %}"
        "{% endif %}"
        "{% if message['role'] == 'user' %}"  # After all of that, handle messages/roles in a fairly normal way
        "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}"
        "{% elif message['role'] == 'system' %}"
        "{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}"
        "{% elif message['role'] == 'assistant' %}"
        "{{ ' ' + content.strip() + ' ' + eos_token }}"
        "{% endif %}"
        "{% endfor %}"
    )
    # Substitute the placeholders; newlines/quotes in the default prompt are
    # escaped so the result remains a valid Jinja string literal.
    template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false")
    default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'")
    template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)

    return template

# TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
This method builds inputs with special tokens for the LlamaTokenizerFast class.
Args:
self: The instance of the LlamaTokenizerFast class.
token_ids_0 (list): The list of token IDs for the first sequence.
token_ids_1 (list, optional): The list of token IDs for the second sequence. Defaults to None.
Returns:
list: The list of token IDs representing the input sequences with special tokens added.
Raises:
None
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []



+ 58
- 0
mindnlp/transformers/models/nllb/__init__.py View File

@@ -0,0 +1,58 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
NLLB init
"""
from typing import TYPE_CHECKING

from ....utils import (
OptionalDependencyNotAvailable,
is_sentencepiece_available,
is_tokenizers_available,
)


_import_structure = {}

try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_nllb"] = ["NllbTokenizer"]

try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_nllb_fast"] = ["NllbTokenizerFast"]


if TYPE_CHECKING:
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_nllb import NllbTokenizer

try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_nllb_fast import NllbTokenizerFast

+ 442
- 0
mindnlp/transformers/models/nllb/tokenization_nllb.py View File

@@ -0,0 +1,442 @@
# coding=utf-8
# Copyright 2022 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
NLLBTokenizer
"""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ....utils import logging


logger = logging.get_logger(__name__)

# SentencePiece's meta-symbol (U+2581) marking a word boundary / leading space.
SPIECE_UNDERLINE = "▁"

# Canonical on-disk name of the vocabulary file.
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}

# Remote location of the pretrained vocab file (this repo uses the hf-mirror
# host rather than huggingface.co).
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/nllb-200-distilled-600M": "https://hf-mirror.com/facebook/nllb-200-distilled-600M/blob/main/sentencepiece.bpe.model",
    },
}
# Maximum model input length per checkpoint (positional-embedding size).
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/nllb-200-distilled-600M": 1024,
}


FAIRSEQ_LANGUAGE_CODES = ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab',
'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab',
'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng',
'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn',
'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn',
'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt',
'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn',
'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn',
'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt',
'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn',
'ewe_Latn', 'fao_Latn', 'pes_Arab', 'fij_Latn', 'fin_Latn',
'fon_Latn', 'fra_Latn', 'fur_Latn', 'fuv_Latn', 'gla_Latn',
'gle_Latn', 'glg_Latn', 'grn_Latn', 'guj_Gujr', 'hat_Latn',
'hau_Latn', 'heb_Hebr', 'hin_Deva', 'hne_Deva', 'hrv_Latn',
'hun_Latn', 'hye_Armn', 'ibo_Latn', 'ilo_Latn', 'ind_Latn',
'isl_Latn', 'ita_Latn', 'jav_Latn', 'jpn_Jpan', 'kab_Latn',
'kac_Latn', 'kam_Latn', 'kan_Knda', 'kas_Arab', 'kas_Deva',
'kat_Geor', 'knc_Arab', 'knc_Latn', 'kaz_Cyrl', 'kbp_Latn',
'kea_Latn', 'khm_Khmr', 'kik_Latn', 'kin_Latn', 'kir_Cyrl',
'kmb_Latn', 'kon_Latn', 'kor_Hang', 'kmr_Latn', 'lao_Laoo',
'lvs_Latn', 'lij_Latn', 'lim_Latn', 'lin_Latn', 'lit_Latn',
'lmo_Latn', 'ltg_Latn', 'ltz_Latn', 'lua_Latn', 'lug_Latn',
'luo_Latn', 'lus_Latn', 'mag_Deva', 'mai_Deva', 'mal_Mlym',
'mar_Deva', 'min_Latn', 'mkd_Cyrl', 'plt_Latn', 'mlt_Latn',
'mni_Beng', 'khk_Cyrl', 'mos_Latn', 'mri_Latn', 'zsm_Latn',
'mya_Mymr', 'nld_Latn', 'nno_Latn', 'nob_Latn', 'npi_Deva',
'nso_Latn', 'nus_Latn', 'nya_Latn', 'oci_Latn', 'gaz_Latn',
'ory_Orya', 'pag_Latn', 'pan_Guru', 'pap_Latn', 'pol_Latn',
'por_Latn', 'prs_Arab', 'pbt_Arab', 'quy_Latn', 'ron_Latn',
'run_Latn', 'rus_Cyrl', 'sag_Latn', 'san_Deva', 'sat_Beng',
'scn_Latn', 'shn_Mymr', 'sin_Sinh', 'slk_Latn', 'slv_Latn',
'smo_Latn', 'sna_Latn', 'snd_Arab', 'som_Latn', 'sot_Latn',
'spa_Latn', 'als_Latn', 'srd_Latn', 'srp_Cyrl', 'ssw_Latn',
'sun_Latn', 'swe_Latn', 'swh_Latn', 'szl_Latn', 'tam_Taml',
'tat_Cyrl', 'tel_Telu', 'tgk_Cyrl', 'tgl_Latn', 'tha_Thai',
'tir_Ethi', 'taq_Latn', 'taq_Tfng', 'tpi_Latn', 'tsn_Latn',
'tso_Latn', 'tuk_Latn', 'tum_Latn', 'tur_Latn', 'twi_Latn',
'tzm_Tfng', 'uig_Arab', 'ukr_Cyrl', 'umb_Latn', 'urd_Arab',
'uzn_Latn', 'vec_Latn', 'vie_Latn', 'war_Latn', 'wol_Latn',
'xho_Latn', 'ydd_Hebr', 'yor_Latn', 'yue_Hant', 'zho_Hans', 'zho_Hant', 'zul_Latn'] # fmt: skip



class NllbTokenizer(PreTrainedTokenizer):
    """
    Construct an NLLB tokenizer.

    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import NllbTokenizer

    >>> tokenizer = NllbTokenizer.from_pretrained(
    ...     "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
    ... )
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
    ```

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        tokenizer_file (`str`, *optional*):
            The path to a tokenizer file to use instead of the vocab file.
        src_lang (`str`, *optional*):
            The language to use as source language for translation.
        tgt_lang (`str`, *optional*):
            The language to use as target language for translation.
        sp_model_kwargs (`Dict[str, str]`):
            Additional keyword arguments to pass to the model initialization.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    # Special-token ids wrapped around every encoded sequence; re-assigned (not
    # mutated) per-instance by set_src_lang_special_tokens / set_tgt_lang_special_tokens.
    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        tokenizer_file=None,
        src_lang=None,
        tgt_lang=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        additional_special_tokens=None,
        legacy_behaviour=False,
        **kwargs,
    ):
        """
        Load the SentencePiece model from `vocab_file`, register the four
        fairseq special tokens at their fixed indices, and initialize the
        source-language special-token state (defaults to `eng_Latn`).
        """
        # By default all 200+ NLLB language codes are registered as special tokens.
        if additional_special_tokens is None:
            additional_special_tokens = FAIRSEQ_LANGUAGE_CODES
        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
        # Mask token behave like a normal word, i.e. include the space before it
        mask_token = (
            AddedToken(mask_token, normalized=True, lstrip=True, special=True)
            if isinstance(mask_token, str)
            else mask_token
        )

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.legacy_behaviour = legacy_behaviour

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file
        # Original fairseq vocab and spm vocab must be "aligned":
        # Vocab    |    0    |    1    |   2    |    3    |  4   |  5   |  6   |   7  |  8   |  9
        # -------- | ------- | ------- | ------ | ------- | ---- | ---- | ---- | ---- | ---- | ----
        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | 'an' | '▁n' | '▁m' | '▁t' | '▁k' | '▁a'
        # spm      | '<unk>' | '<s>'   | '</s>' | 'an'    | '▁n' | '▁m' | '▁t' | '▁k' | '▁a' | '▁s'

        # unk token needs to be in the vocab with correct index
        self._added_tokens_decoder = {0: bos_token, 1: pad_token, 2: eos_token, 3: unk_token}
        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 1
        self.sp_model_size = len(self.sp_model)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            tokenizer_file=tokenizer_file,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            legacy_behaviour=legacy_behaviour,
            **kwargs,
        )

        self._src_lang = src_lang if src_lang is not None else "eng_Latn"
        self.cur_lang_code_id = self.convert_tokens_to_ids(self._src_lang)
        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def __getstate__(self):
        """Support pickling: the SentencePiece processor itself is not
        picklable, so serialize its model proto instead."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        """Restore from pickle: rebuild the SentencePiece processor from the
        serialized model proto saved by `__getstate__`."""
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        """int: Size of the base vocabulary (spm pieces shifted by the fairseq offset)."""
        return len(self.sp_model) + self.fairseq_offset

    @property
    def src_lang(self) -> str:
        """str: Current source-language code (e.g. `"eng_Latn"`)."""
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        # Changing the source language also refreshes prefix/suffix special tokens.
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Only the language-code/eos wrappers count as special; sequence tokens are 0.
        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. An NLLB sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

        """

        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = src_lang
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        # generate() will force the decoder to start with the target language code.
        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def get_vocab(self):
        """Return the full token -> id mapping, including added tokens."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a string into SentencePiece sub-word pieces."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        spm_id = self.sp_model.PieceToId(token)
        # Need to return unknown token if the SP model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Copy (or re-serialize) the SentencePiece model file into `save_directory`.

        Returns a 1-tuple with the written file path, or `None` (implicitly)
        when `save_directory` is not a directory — the error is only logged.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Prefer copying the original file; fall back to dumping the in-memory
        # serialized proto when the original is unavailable.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        src_lang: str = "eng_Latn",
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "fra_Latn",
        **kwargs,
    ) -> BatchEncoding:
        """Set source/target languages, then delegate batch preparation to the base class."""
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        # Called by the base class before encoding source text.
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        # Called by the base class before encoding target text (e.g. `text_target=`).
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting.
        - In legacy mode: No prefix and suffix=[eos, src_lang_code].
        - In default mode: Prefix=[src_lang_code], suffix = [eos]
        """
        self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
        if self.legacy_behaviour:
            self.prefix_tokens = []
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.prefix_tokens = [self.cur_lang_code]
            self.suffix_tokens = [self.eos_token_id]

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target lang setting.
        - In legacy mode: No prefix and suffix=[eos, tgt_lang_code].
        - In default mode: Prefix=[tgt_lang_code], suffix = [eos]
        """
        self.cur_lang_code = self.convert_tokens_to_ids(lang)
        if self.legacy_behaviour:
            self.prefix_tokens = []
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.prefix_tokens = [self.cur_lang_code]
            self.suffix_tokens = [self.eos_token_id]

+ 395
- 0
mindnlp/transformers/models/nllb/tokenization_nllb_fast.py View File

@@ -0,0 +1,395 @@
# coding=utf-8
# Copyright 2022 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
NLLBTokenizer_fast
"""

import os
from shutil import copyfile
from typing import List, Optional, Tuple

from tokenizers import processors

from ...tokenization_utils import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ....utils import is_sentencepiece_available, logging


# The slow tokenizer is only importable when the optional sentencepiece
# backend is installed; otherwise expose None so the module still imports.
# NOTE(review): presumably consumed as `slow_tokenizer_class` on
# NllbTokenizerFast — that attribute is outside this chunk, confirm.
if is_sentencepiece_available():
    from .tokenization_nllb import NllbTokenizer
else:
    NllbTokenizer = None


logger = logging.get_logger(__name__)


VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}

PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/nllb-200-distilled-600M": "https://hf-mirror.com/facebook/nllb-200-distilled-600M/blob/main/sentencepiece.bpe.model",
},
"tokenizer_file": {
"facebook/nllb-200-distilled-600M": "https://hf-mirror.com/facebook/nllb-200-distilled-600M/blob/main/tokenizer.json",
},
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/nllb-large-en-ro": 1024,
"facebook/nllb-200-distilled-600M": 1024,
}

FAIRSEQ_LANGUAGE_CODES = ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab',
'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn',
'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab',
'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn',
'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn',
'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl',
'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab',
'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn',
'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn',
'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn',
'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn',
'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn',
'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn',
'pes_Arab', 'fij_Latn', 'fin_Latn', 'fon_Latn',
'fra_Latn', 'fur_Latn', 'fuv_Latn', 'gla_Latn',
'gle_Latn', 'glg_Latn', 'grn_Latn', 'guj_Gujr',
'hat_Latn', 'hau_Latn', 'heb_Hebr', 'hin_Deva',
'hne_Deva', 'hrv_Latn', 'hun_Latn', 'hye_Armn',
'ibo_Latn', 'ilo_Latn', 'ind_Latn', 'isl_Latn',
'ita_Latn', 'jav_Latn', 'jpn_Jpan', 'kab_Latn',
'kac_Latn', 'kam_Latn', 'kan_Knda', 'kas_Arab',
'kas_Deva', 'kat_Geor', 'knc_Arab', 'knc_Latn',
'kaz_Cyrl', 'kbp_Latn', 'kea_Latn', 'khm_Khmr',
'kik_Latn', 'kin_Latn', 'kir_Cyrl', 'kmb_Latn',
'kon_Latn', 'kor_Hang', 'kmr_Latn', 'lao_Laoo',
'lvs_Latn', 'lij_Latn', 'lim_Latn', 'lin_Latn',
'lit_Latn', 'lmo_Latn', 'ltg_Latn', 'ltz_Latn',
'lua_Latn', 'lug_Latn', 'luo_Latn', 'lus_Latn',
'mag_Deva', 'mai_Deva', 'mal_Mlym', 'mar_Deva',
'min_Latn', 'mkd_Cyrl', 'plt_Latn', 'mlt_Latn',
'mni_Beng', 'khk_Cyrl', 'mos_Latn', 'mri_Latn',
'zsm_Latn', 'mya_Mymr', 'nld_Latn', 'nno_Latn',
'nob_Latn', 'npi_Deva', 'nso_Latn', 'nus_Latn',
'nya_Latn', 'oci_Latn', 'gaz_Latn', 'ory_Orya',
'pag_Latn', 'pan_Guru', 'pap_Latn', 'pol_Latn',
'por_Latn', 'prs_Arab', 'pbt_Arab', 'quy_Latn',
'ron_Latn', 'run_Latn', 'rus_Cyrl', 'sag_Latn',
'san_Deva', 'sat_Beng', 'scn_Latn', 'shn_Mymr',
'sin_Sinh', 'slk_Latn', 'slv_Latn', 'smo_Latn',
'sna_Latn', 'snd_Arab', 'som_Latn', 'sot_Latn',
'spa_Latn', 'als_Latn', 'srd_Latn', 'srp_Cyrl',
'ssw_Latn', 'sun_Latn', 'swe_Latn', 'swh_Latn',
'szl_Latn', 'tam_Taml', 'tat_Cyrl', 'tel_Telu',
'tgk_Cyrl', 'tgl_Latn', 'tha_Thai', 'tir_Ethi',
'taq_Latn', 'taq_Tfng', 'tpi_Latn', 'tsn_Latn',
'tso_Latn', 'tuk_Latn', 'tum_Latn', 'tur_Latn',
'twi_Latn', 'tzm_Tfng', 'uig_Arab', 'ukr_Cyrl',
'umb_Latn', 'urd_Arab', 'uzn_Latn', 'vec_Latn',
'vie_Latn', 'war_Latn', 'wol_Latn', 'xho_Latn',
'ydd_Hebr', 'yor_Latn', 'yue_Hant', 'zho_Hans', 'zho_Hant', 'zul_Latn'] # fmt: skip


class NllbTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" NLLB tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import NllbTokenizerFast

    >>> tokenizer = NllbTokenizerFast.from_pretrained(
    ...     "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
    ... )
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
    ```

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        tokenizer_file (`str`, *optional*):
            The path to a tokenizer file to use instead of the vocab file.
        src_lang (`str`, *optional*):
            The language to use as source language for translation.
        tgt_lang (`str`, *optional*):
            The language to use as target language for translation.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = NllbTokenizer

    # Language-dependent special tokens wrapped around every encoded sequence;
    # (re)computed by `_set_lang_special_tokens` whenever the active language changes.
    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        src_lang=None,
        tgt_lang=None,
        additional_special_tokens=None,
        legacy_behaviour=False,
        **kwargs,
    ):
        # By default, register every FLORES-200 language code as a special token so
        # `convert_tokens_to_ids` resolves them to single ids.
        if additional_special_tokens is None:
            additional_special_tokens = FAIRSEQ_LANGUAGE_CODES

        self.vocab_file = vocab_file
        # Mask token behaves like a normal word, i.e. include the space before it.
        mask_token = (
            AddedToken(mask_token, normalized=True, lstrip=True, special=True)
            if isinstance(mask_token, str)
            else mask_token
        )
        self.legacy_behaviour = legacy_behaviour
        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            mask_token=mask_token,
            additional_special_tokens=additional_special_tokens,
            legacy_behaviour=legacy_behaviour,
            **kwargs,
        )

        self._src_lang = src_lang if src_lang is not None else "eng_Latn"
        self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """Whether the sentencepiece vocab file needed by the slow tokenizer is available on disk."""
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    @property
    def src_lang(self) -> str:
        """The currently active source language code (e.g. `"eng_Latn"`)."""
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        # Changing the source language must also rebuild the post-processor template.
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. The special tokens depend on calling set_lang.

        An NLLB sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

        """

        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = src_lang
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        # `forced_bos_token_id` makes `generate` start decoding with the target language code.
        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        src_lang: str = "eng_Latn",
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "fra_Latn",
        **kwargs,
    ) -> BatchEncoding:
        """Set the source/target languages, then delegate batch preparation to the superclass."""
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting.
        - In legacy mode: No prefix and suffix=[eos, src_lang_code].
        - In default mode: Prefix=[src_lang_code], suffix = [eos]
        """
        self._set_lang_special_tokens(src_lang)

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target lang setting.
        - In legacy mode: No prefix and suffix=[eos, tgt_lang_code].
        - In default mode: Prefix=[tgt_lang_code], suffix = [eos]
        """
        self._set_lang_special_tokens(lang)

    def _set_lang_special_tokens(self, lang: str) -> None:
        """Shared implementation for the source/target setters above.

        Updates `cur_lang_code`, `prefix_tokens` and `suffix_tokens` for `lang`, then rebuilds the
        backend tokenizer's post-processor so encoding applies the new prefix/suffix automatically.
        """
        self.cur_lang_code = self.convert_tokens_to_ids(lang)

        if self.legacy_behaviour:
            self.prefix_tokens = []
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.prefix_tokens = [self.cur_lang_code]
            self.suffix_tokens = [self.eos_token_id]

        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

        # TemplateProcessing needs both the string form (for the template) and the
        # (token, id) pairs so the backend can emit the correct ids.
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
        )

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Copy the sentencepiece vocab file into `save_directory` and return its path.

        Raises:
            ValueError: If the tokenizer was loaded without a vocab file and therefore cannot
                provide the information a slow tokenizer needs.
        """
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            # Upstream convention: log and return None rather than raising on a bad path.
            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Skip the copy when saving over the file already in place.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

+ 378
- 879
mindnlp/transformers/models/qwen2/modeling_qwen2.py View File

@@ -17,24 +17,28 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
""" MindSpore Qwen2 model."""
"""MindSpore Qwen2 model."""
import math
from typing import List, Optional, Tuple, Union

import numpy as np
import mindspore
from mindspore import Tensor, Parameter
from mindspore.common.initializer import initializer, Normal

from mindnlp.core import nn, ops, get_default_dtype
from mindnlp.core.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from mindnlp.core.nn import functional as F
from mindnlp.utils import logging
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
BaseModelOutputWithPast,
CausalLMOutputWithPast,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ....utils import logging
from ....configs import SUPPORT_VIEW, USE_PYBOOST
from .configuration_qwen2 import Qwen2Config


@@ -44,143 +48,98 @@ logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta"
_CONFIG_FOR_DOC = "Qwen2Config"

QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"Qwen/Qwen2-7B-beta",
# See all Qwen2 models at https://hf-mirror.com/models?filter=qwen2
]


# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
def _prepare_4d_causal_attention_mask_with_cache_position(
attention_mask: mindspore.Tensor,
sequence_length: int,
target_length: int,
dtype: mindspore.dtype,
min_dtype: float,
cache_position: mindspore.Tensor,
batch_size: int,
):
"""
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
attention_mask (Tensor): A tensor representing the attention mask for the input sequences.
Its purpose is to indicate which tokens in the input sequences should be attended to and which should be ignored.
It should be a 2D tensor with a shape of (batch_size, sequence_length) and contain binary values (0 or 1).
Returns:
Tuple of Tensors:
The function returns a tuple containing the following:

- indices (Tensor): A 1D tensor containing the indices of the non-zero elements in the flattened
attention_mask tensor.
- cu_seqlens (Tensor): A 1D tensor representing the cumulative sum of the sequence lengths in the batch,
padded with a zero at the beginning.
- max_seqlen_in_batch (int): The maximum sequence length in the batch.

Raises:
None
attention_mask (`mindspore.Tensor`):
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
sequence_length (`int`):
The sequence length being processed.
target_length (`int`):
The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
min_dtype (`float`):
The minimum value representable with the dtype `dtype`.
cache_position (`mindspore.Tensor`):
Indices depicting the position of the input sequence tokens in the sequence.
batch_size (`mindspore.Tensor`):
Batch size.
"""
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=mindspore.int32)
indices = ops.nonzero(attention_mask.flatten()).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = ops.pad(ops.cumsum(seqlens_in_batch, dim=0, dtype=mindspore.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
if attention_mask is not None and attention_mask.ndim == 4:
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
causal_mask = attention_mask
else:
causal_mask = ops.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
if sequence_length != 1:
causal_mask = ops.triu(causal_mask, diagonal=1)
causal_mask *= ops.arange(target_length) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].broadcast_to((batch_size, 1, -1, -1))
if attention_mask is not None:
if SUPPORT_VIEW:
causal_mask = causal_mask.contiguous() # copy to contiguous memory for in-place edit
else:
causal_mask = causal_mask.copy()
mask_length = attention_mask.shape[-1]
# padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
padding_mask = ops.narrow(causal_mask, -1, 0, mask_length) + attention_mask[:, None, None, :]
padding_mask = padding_mask == 0
# causal_mask[:, :, :, :mask_length] = ops.narrow(causal_mask, -1, 0, mask_length).masked_fill(
# padding_mask, min_dtype
# )
if mask_length >= causal_mask.shape[-1]:
causal_mask = causal_mask.masked_fill(padding_mask, min_dtype)
else:
causal_mask = ops.cat(
[ops.narrow(causal_mask, -1, 0, mask_length).masked_fill(padding_mask, min_dtype),
ops.narrow(causal_mask, -1, mask_length, causal_mask.shape[-1] - mask_length)],
dim=-1
)

return causal_mask

# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2
class Qwen2RMSNorm(nn.Module):

"""
Qwen2RMSNorm is a custom normalization layer that inherits from nn.Module. It is equivalent to T5LayerNorm and is
designed to normalize the input hidden states.

This class initializes with the specified hidden_size and an optional epsilon value for variance smoothing.
The normalization process involves scaling the hidden states based on the calculated variance and the provided
weight parameter.

The forward method takes hidden_states as input and performs the normalization operation, ensuring that the
output matches the input data type. The normalized hidden_states are then multiplied by the weight parameter to
produce the final output.

Note:
This docstring is based on the provided information and does not include actual code or signatures.
"""
def __init__(self, hidden_size, eps=1e-6):
"""
Qwen2RMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = Parameter(ops.ones(hidden_size))
self.weight = nn.Parameter(ops.ones(hidden_size))
self.variance_epsilon = eps

def forward(self, hidden_states):
"""
Constructs the RMS normalization of hidden states.

Args:
self (Qwen2RMSNorm): The instance of the Qwen2RMSNorm class.
hidden_states (Tensor): The input hidden states to be normalized.
Should be a tensor of any shape with dtype compatible with float32.

Returns:
None: The method modifies the hidden_states tensor in-place.

Raises:
ValueError: If hidden_states is not a valid tensor.
TypeError: If hidden_states dtype is not compatible with float32.
"""
if not self.training and USE_PYBOOST:
return F.rms_norm(hidden_states, self.weight, self.variance_epsilon)
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(mindspore.float32)
variance = hidden_states.pow(2).mean(-1, keep_dims=True)
variance = ops.mean(hidden_states.pow(2), -1, keepdim=True)
hidden_states = hidden_states * ops.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)

def extra_repr(self):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"

# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Qwen2
# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2
class Qwen2RotaryEmbedding(nn.Module):

"""
Represents a Qwen2RotaryEmbedding module that inherits from nn.Module. This module implements the Qwen2Rotary
embedding as described in the code.

Attributes:
dim (int): The dimension of the embedding.
max_position_embeddings (int): The maximum position embeddings.
base (int): The base value used in the embedding calculation.

Methods:
_set_cos_sin_cache: Sets the cosine and sine cache for the given sequence length and data type.
forward(: Constructs the Qwen2Rotary embedding for the input with optional sequence length.

Note:
The Qwen2RotaryEmbedding module provides functionality for Qwen2Rotary embedding calculation, including setting
cosine and sine cache and forwarding the embedding.
"""
def __init__(self, dim, max_position_embeddings=2048, base=10000):
"""
Initializes a new instance of the Qwen2RotaryEmbedding class.

Args:
self: The object itself.
dim (int): The dimensionality of the embedding vectors.
max_position_embeddings (int, optional): The maximum number of position embeddings to generate.
Defaults to 2048.
base (int, optional): The base value used in the calculation of inverse frequency. Defaults to 10000.

Returns:
None.

Raises:
None.

This method initializes the Qwen2RotaryEmbedding object with the specified dimensionality,
maximum position embeddings, and base value. It calculates the inverse frequency based on the dimensionality
and stores it in the 'inv_freq' attribute. Additionally, it sets the cosine and sine cache based on the
maximum position embeddings.
"""
super().__init__()

self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (ops.arange(0, self.dim, 2, dtype=mindspore.int64).float() / self.dim))
self.inv_freq = inv_freq
self.register_buffer("inv_freq", inv_freq, persistent=False)

# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(
@@ -188,45 +147,16 @@ class Qwen2RotaryEmbedding(nn.Module):
)

def _set_cos_sin_cache(self, seq_len, dtype):
"""
Sets the cosine and sine cache for the Qwen2RotaryEmbedding class.

Args:
self (Qwen2RotaryEmbedding): The instance of the Qwen2RotaryEmbedding class.
seq_len (int): The length of the sequence.
dtype (dtype): The desired data type for the cache.

Returns:
None: This method updates the 'cos_cached' and 'sin_cached' attributes of the Qwen2RotaryEmbedding instance.

Raises:
None.
"""
self.max_seq_len_cached = seq_len
t = ops.arange(self.max_seq_len_cached, dtype=mindspore.int64).type_as(self.inv_freq)

freqs = ops.outer(t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = ops.cat((freqs, freqs), dim=-1)
self.cos_cached = emb.cos().to(dtype)
self.sin_cached = emb.sin().to(dtype)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

def forward(self, x, seq_len=None):
"""
Constructs the Qwen2RotaryEmbedding for the given input tensor 'x' and sequence length 'seq_len'.

Args:
self: The instance of the Qwen2RotaryEmbedding class.
x: A tensor representing the input data.
seq_len: An optional integer representing the length of the sequence. Defaults to None.

Returns:
None: This method modifies the internal state of the Qwen2RotaryEmbedding instance.

Raises:
ValueError: If 'seq_len' is not a positive integer.
TypeError: If the data type of 'x' is not supported for the internal calculations.
"""
# x: [bs, num_attention_heads, seq_len, head_size]
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, dtype=x.dtype)
@@ -242,14 +172,12 @@ def rotate_half(x):
"""Rotates half the hidden dims of the input."""
# x1 = x[..., : x.shape[-1] // 2]
# x2 = x[..., x.shape[-1] // 2 :]
x1, x2 = x.tensor_split(2, -1)
x1, x2 = ops.split(x, x.shape[-1] // 2, dim=-1)
return ops.cat((-x2, x1), dim=-1)


# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""
Applies Rotary Position Embedding to the query and key tensors.
"""Applies Rotary Position Embedding to the query and key tensors.

Args:
q (`mindspore.Tensor`): The query tensor.
@@ -266,12 +194,12 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.

Returns:
`tuple(mindspore.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
"""
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
position_ids = (position_ids + cos.shape[0]) % cos.shape[0]
cos = F.embedding(position_ids, cos).unsqueeze(unsqueeze_dim)
sin = F.embedding(position_ids, sin).unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
@@ -279,50 +207,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):

# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2
class Qwen2MLP(nn.Module):

"""
Qwen2MLP is a Python class that represents a multi-layer perceptron (MLP) with specific configurations for gate, up,
and down projections. This class inherits from nn.Module and is designed to be used in neural network models for
deep learning applications.

Attributes:
config: A configuration object containing settings for the hidden size and intermediate size of the MLP.
hidden_size: An integer representing the size of the hidden layer in the MLP.
intermediate_size: An integer representing the size of the intermediate layer in the MLP.
gate_proj: An instance of nn.Linear for projecting input data to the intermediate size with no bias.
up_proj: An instance of nn.Linear for projecting input data to the intermediate size with no bias.
down_proj: An instance of nn.Linear for projecting data from the intermediate size back to the hidden size with no bias.
act_fn: An activation function determined by the configuration settings.

Methods:
forward: A method that takes input data x and performs the forward pass through the MLP using the
defined projections and activation function.

Note:
The Qwen2MLP class is intended to be used as part of a larger neural network model and provides a configurable
multi-layer perceptron with specific projection and activation settings.
"""
def __init__(self, config):
"""
Initializes an instance of the Qwen2MLP class.

Args:
self: The instance of the class.
config: An object containing configuration parameters for the MLP.
It should have the following attributes:

- hidden_size: An integer specifying the size of the hidden layer.
- intermediate_size: An integer specifying the size of the intermediate layer.
- hidden_act: A string specifying the activation function to be used in the hidden layer.

Returns:
None

Raises:
None
"""
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.intermediate_size = config.intermediate_size
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
@@ -330,27 +216,8 @@ class Qwen2MLP(nn.Module):
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
self.act_fn = ACT2FN[config.hidden_act]

def forward(self, x):
"""
Constructs a new object using the Qwen2MLP class.

Args:
self: An instance of the Qwen2MLP class.
x: The input parameter of type 'Any', representing the data to be processed.

Returns:
This method returns None.

Raises:
None.

This method forwards a new object by performing a series of operations on the input data 'x'.
It first applies the 'gate_proj' function to 'x' and then applies the 'act_fn' function to the result.
The output of 'act_fn' is multiplied element-wise with the result of applying the 'down_proj' function to 'x'.
Finally, the result is multiplied with the output of applying the 'up_proj' function to 'x'.
The forwarded object is returned as None.
"""
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
def forward(self, hidden_state):
return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))


# Copied from transformers.models.llama.modeling_llama.repeat_kv
@@ -371,24 +238,8 @@ class Qwen2Attention(nn.Module):
Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
and "Generating Long Sequences with Sparse Transformers".
"""
def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
"""
Initializes an instance of the Qwen2Attention class.

Args:
self: The instance of the class.
config (Qwen2Config): An instance of the Qwen2Config class containing configuration parameters for
the attention mechanism.
layer_idx (Optional[int]): The index of the layer. Defaults to None. If None, a warning is logged
as it may lead to errors during forward call if caching is used. It is recommended to provide a
valid layer index when creating the class.

Returns:
None.

Raises:
ValueError: If the `hidden_size` is not divisible by `num_heads`.
"""
def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
super().__init__()
self.config = config
self.layer_idx = layer_idx
@@ -432,47 +283,18 @@ class Qwen2Attention(nn.Module):
position_ids: Optional[mindspore.Tensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
**kwargs,
use_cache: bool = False,
cache_position: Optional[mindspore.Tensor] = None,
) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
'''
This method forwards the Qwen2Attention layer.

Args:
self: The instance of the class.
hidden_states (mindspore.Tensor): The input tensor of shape (batch_size, sequence_length, hidden_size).
attention_mask (Optional[mindspore.Tensor]): An optional tensor of shape
(batch_size, 1, sequence_length, key_value_sequence_length) containing indices to be masked.
position_ids (Optional[mindspore.Tensor]): An optional tensor of shape (batch_size, sequence_length)
containing the position indices of each token in the input sequence.
past_key_value (Optional[Cache]): An optional object representing the cached key and value tensors
from previous time steps.
output_attentions (bool): A flag indicating whether to return the attention weights.

Returns:
Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
A tuple containing:

- attn_output (mindspore.Tensor): The output tensor of shape (batch_size, sequence_length, hidden_size).
- attn_weights (Optional[mindspore.Tensor]): The attention weights tensor of shape
(batch_size, num_heads, sequence_length, key_value_sequence_length),
if output_attentions is True, else None.
- past_key_value (Optional[Tuple[mindspore.Tensor]]): The updated key and value tensors,
if past_key_value is not None and caching is enabled, else None.

Raises:
ValueError: If the cache structure has changed and the layer index is not provided,
if the shape of attention weights or attention mask is incorrect, or if the shape of the
output tensor is not as expected.
'''
bsz, q_len, _ = hidden_states.shape

query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)

query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).swapaxes(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).swapaxes(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).swapaxes(1, 2)
query_states = ops.transpose(query_states.view(bsz, q_len, self.num_heads, self.head_dim), 1, 2)
key_states = ops.transpose(key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim), 1, 2)
value_states = ops.transpose(value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim), 1, 2)

kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
@@ -487,14 +309,14 @@ class Qwen2Attention(nn.Module):
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

# repeat k/v heads if n_kv_heads < n_heads
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)

attn_weights = ops.matmul(query_states, key_states.swapaxes(2, 3)) / math.sqrt(self.head_dim)
attn_weights = ops.matmul(query_states, ops.transpose(key_states, 2, 3)) / math.sqrt(self.head_dim)

if attn_weights.shape != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
@@ -502,17 +324,13 @@ class Qwen2Attention(nn.Module):
f" {attn_weights.shape}"
)

if attention_mask is not None:
if attention_mask.shape != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}"
)

attn_weights = attn_weights + attention_mask
if attention_mask is not None: # no matter the length, we just slice it
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask

# upcast attention to fp32
attn_weights = ops.softmax(attn_weights, dim=-1, dtype=mindspore.float32).to(query_states.dtype)
attn_weights = F.dropout(attn_weights, p=self.attention_dropout, training=self.training)
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=mindspore.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
attn_output = ops.matmul(attn_weights, value_states)

if attn_output.shape != (bsz, self.num_heads, q_len, self.head_dim):
@@ -521,7 +339,7 @@ class Qwen2Attention(nn.Module):
f" {attn_output.shape}"
)

attn_output = attn_output.swapaxes(1, 2)
attn_output = ops.transpose(attn_output, 1, 2)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

attn_output = self.o_proj(attn_output)
@@ -538,55 +356,16 @@ QWEN2_ATTENTION_CLASSES = {


class Qwen2DecoderLayer(nn.Module):

"""
Qwen2DecoderLayer is a class representing a single layer of the Qwen2 decoder. It inherits from nn.Module and
contains methods for initializing the layer and forwarding the layer's operations.

Attributes:
hidden_size (int): The size of the hidden state.
self_attn (QWEN2_ATTENTION_CLASSES): The self-attention mechanism used in the layer.
mlp (Qwen2MLP): The multi-layer perceptron used in the layer.
input_layernorm (Qwen2RMSNorm): The layer normalization applied to the input.
post_attention_layernorm (Qwen2RMSNorm): The layer normalization applied after the attention mechanism.

Methods:
__init__: Initializes the Qwen2DecoderLayer with the given configuration and layer index.
forward:
Applies the layer operations to the input hidden_states and returns the resulting output tensor along with
optional additional tensors, such as attention weights and present key value.

Args:
hidden_states (mindspore.Tensor): Input to the layer of shape (batch, seq_len, embed_dim).
attention_mask (mindspore.Tensor, optional): Attention mask of size (batch, sequence_length)
where padding elements are indicated by 0.
output_attentions (bool, optional): Whether or not to return the attentions tensors of all attention layers.
use_cache (bool, optional): If set to True, past_key_values key value states are returned and can be used to
speed up decoding.
past_key_value (Tuple(mindspore.Tensor), optional): Cached past key and value projection states.

Returns:
Tuple[mindspore.Tensor, Optional[Tuple[mindspore.Tensor, mindspore.Tensor]]]: The output tensor and optional
additional tensors based on the input arguments.
"""
def __init__(self, config: Qwen2Config, layer_idx: int):
"""
Initializes a Qwen2DecoderLayer object.

Args:
self (Qwen2DecoderLayer): The instance of the Qwen2DecoderLayer class.
config (Qwen2Config): An object containing configuration settings for the decoder layer.
layer_idx (int): An integer representing the index of the layer.

Returns:
None.

Raises:
None.
"""
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = QWEN2_ATTENTION_CLASSES["eager"](config, layer_idx)

if config.sliding_window and config._attn_implementation != "flash_attention_2":
logger.warning_once(
f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
"unexpected results may be encountered."
)
self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

self.mlp = Qwen2MLP(config)
self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -600,6 +379,7 @@ class Qwen2DecoderLayer(nn.Module):
past_key_value: Optional[Tuple[mindspore.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[mindspore.Tensor] = None,
**kwargs,
) -> Tuple[mindspore.Tensor, Optional[Tuple[mindspore.Tensor, mindspore.Tensor]]]:
"""
@@ -614,7 +394,13 @@ class Qwen2DecoderLayer(nn.Module):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(mindspore.Tensor)`, *optional*): cached past key and value projection states
cache_position (`mindspore.Tensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence.
kwargs (`dict`, *optional*):
Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
into the model
"""

residual = hidden_states

hidden_states = self.input_layernorm(hidden_states)
@@ -627,6 +413,7 @@ class Qwen2DecoderLayer(nn.Module):
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
)
hidden_states = residual + hidden_states

@@ -648,73 +435,23 @@ class Qwen2DecoderLayer(nn.Module):


class Qwen2PreTrainedModel(PreTrainedModel):

"""
This class represents a Qwen2PreTrainedModel, which is a subclass of PreTrainedModel.
It provides methods for initializing the weights of the model's cells.

Methods:
_init_weights:
Initializes the weights of a given cell.

Parameters:

- cell: The cell to initialize the weights for.

Returns:
None

Details:
The _init_weights method initializes the weights of the specified cell. It first checks the type of the cell.
If it is of type nn.Linear, it sets the weight data using the initializer function.
The initializer function takes the following parameters:

- Normal(self.config.initializer_range): A normal distribution initializer with the specified range.
- cell.weight.shape: The shape of the weight tensor.
- cell.weight.dtype: The data type of the weight tensor.

If the cell has a bias, it also sets the bias data using the initializer function with the following parameters:

- 'zeros': A zero initializer.
- cell.bias.shape: The shape of the bias tensor.
- cell.bias.dtype: The data type of the bias tensor.

If the cell is of type nn.Embedding, it generates random weights using the numpy random.normal function.
The parameters for the random.normal function are:

- 0.0: The mean of the normal distribution.
- self.config.initializer_range: The standard deviation of the normal distribution.
- cell.weight.shape: The shape of the weight tensor.

If the cell has a padding_idx, it sets the value at that index to 0.

Finally, the initialized weights are set to the cell using the Tensor function with the following parameters:

- weight: The initialized weight tensor.
- cell.weight.dtype: The data type of the weight tensor.
"""
config_class = Qwen2Config
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["Qwen2DecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
_supports_sdpa = True
_supports_cache_class = True

def _init_weights(self, cell):
"""Initialize the weights"""
if isinstance(cell, nn.Linear):
cell.weight.set_data(initializer(Normal(self.config.initializer_range),
cell.weight.shape, cell.weight.dtype))
if cell.bias is not None:
cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype))
elif isinstance(cell, nn.Embedding):
weight = np.random.normal(0.0, self.config.initializer_range, cell.weight.shape)
if cell.padding_idx:
weight[cell.padding_idx] = 0

cell.weight.set_data(Tensor(weight, cell.weight.dtype))
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
nn.init.normal_(module.weight, mean=0.0, std=std)
if module.bias is not None:
nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding):
nn.init.normal_(module.weight, mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx] = 0


class Qwen2Model(Qwen2PreTrainedModel):
@@ -724,31 +461,8 @@ class Qwen2Model(Qwen2PreTrainedModel):
Args:
config: Qwen2Config
"""
def __init__(self, config: Qwen2Config):
"""
Initializes a Qwen2Model instance.

Args:
self (Qwen2Model): The instance of the Qwen2Model class.
config (Qwen2Config):
An instance of Qwen2Config containing configuration parameters for the model.
It specifies the model configuration including the vocabulary size, hidden size, number of
hidden layers, padding token id, and RMS normalization epsilon.

The config object should have the following attributes:

- pad_token_id (int): The token id for padding.
- vocab_size (int): The size of the vocabulary.
- hidden_size (int): The size of the hidden layers.
- num_hidden_layers (int): The number of hidden layers in the model.
- rms_norm_eps (float): Epsilon value for RMS normalization.

Returns:
None.

Raises:
None.
"""
def __init__(self, config: Qwen2Config):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
@@ -757,6 +471,7 @@ class Qwen2Model(Qwen2PreTrainedModel):
self.layers = nn.ModuleList(
[Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self._attn_implementation = config._attn_implementation
self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

self.gradient_checkpointing = False
@@ -764,40 +479,9 @@ class Qwen2Model(Qwen2PreTrainedModel):
self.post_init()

def get_input_embeddings(self):
"""
Method to retrieve the input embeddings from the Qwen2Model class.

Args:
self: An instance of the Qwen2Model class.
This parameter refers to the current instance of the Qwen2Model class.
It is used to access the embed tokens for input embeddings.

Returns:
None:
This method returns None as it simply provides access to the input embeddings.

Raises:
None.
"""
return self.embed_tokens

def set_input_embeddings(self, value):
"""
Sets the input embeddings for the Qwen2Model.

Args:
self: An instance of the Qwen2Model class.
value: The input embeddings to be set for the model. This should be of type torch.Tensor.

Returns:
None.

Raises:
None.

This method sets the input embeddings for the Qwen2Model by assigning the provided 'value' to the
'embed_tokens' attribute of the model instance.
"""
self.embed_tokens = value

def forward(
@@ -811,32 +495,8 @@ class Qwen2Model(Qwen2PreTrainedModel):
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[mindspore.Tensor] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
"""
Construct method in the Qwen2Model class.

Args:
self (Qwen2Model): The instance of the Qwen2Model class.
input_ids (mindspore.Tensor, optional): The input tensor containing token IDs. Default is None.
attention_mask (mindspore.Tensor, optional): An optional tensor specifying the attention mask.
Default is None.
position_ids (mindspore.Tensor, optional): An optional tensor specifying the position IDs. Default is None.
past_key_values (List[mindspore.Tensor], optional): An optional list of tensors for past key values.
Default is None.
inputs_embeds (mindspore.Tensor, optional): An optional tensor containing input embeddings. Default is None.
use_cache (bool, optional): A flag indicating whether to use caching. Default is None.
output_attentions (bool, optional): A flag indicating whether to output attentions. Default is None.
output_hidden_states (bool, optional): A flag indicating whether to output hidden states. Default is None.
return_dict (bool, optional): A flag indicating whether to return a dictionary. Default is None.

Returns:
Union[Tuple, BaseModelOutputWithPast]:
Returns a tuple or BaseModelOutputWithPast object containing model outputs.

Raises:
ValueError: Raised if both input_ids and inputs_embeds are specified, or if neither is specified.
Warning: Raised if `use_cache=True` is incompatible with gradient checkpointing.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -845,15 +505,10 @@ class Qwen2Model(Qwen2PreTrainedModel):

return_dict = return_dict if return_dict is not None else self.config.use_return_dict

# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
if input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
)

if self.gradient_checkpointing and self.training:
if use_cache:
@@ -862,32 +517,28 @@ class Qwen2Model(Qwen2PreTrainedModel):
)
use_cache = False

past_key_values_length = 0

if use_cache:
use_legacy_cache = not isinstance(past_key_values, Cache)
if use_legacy_cache:
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
past_key_values_length = past_key_values.get_usable_length(seq_length)

if position_ids is None:
position_ids = ops.arange(
past_key_values_length, seq_length + past_key_values_length, dtype=mindspore.int64
use_legacy_cache = False
if use_cache and not isinstance(past_key_values, Cache) and not self.training:
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
"We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
else:
position_ids = position_ids.view(-1, seq_length).long()

if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)

# 4d mask is passed through the layers
attention_mask = _prepare_4d_causal_attention_mask(
attention_mask,
(batch_size, seq_length),
inputs_embeds,
past_key_values_length,
sliding_window=self.config.sliding_window,
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
cache_position = ops.arange(
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1]
)
if position_ids is None:
position_ids = cache_position.unsqueeze(0)

causal_mask = self._update_causal_mask(
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
)

hidden_states = inputs_embeds
@@ -901,14 +552,27 @@ class Qwen2Model(Qwen2PreTrainedModel):
if output_hidden_states:
all_hidden_states += (hidden_states,)

layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
decoder_layer.__call__,
hidden_states,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
)

hidden_states = layer_outputs[0]

@@ -937,93 +601,71 @@ class Qwen2Model(Qwen2PreTrainedModel):
attentions=all_self_attns,
)

# Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
def _update_causal_mask(
self,
attention_mask: mindspore.Tensor,
input_tensor: mindspore.Tensor,
cache_position: mindspore.Tensor,
past_key_values: Cache,
output_attentions: bool,
):
# TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
# KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
# (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
# `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114

if self.config._attn_implementation == "flash_attention_2":
if attention_mask is not None and 0.0 in attention_mask:
return attention_mask
return None

# For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
# order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
# to infer the attention mask.
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
using_static_cache = isinstance(past_key_values, StaticCache)

# When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
if AttentionMaskConverter._ignore_causal_mask_sdpa(
attention_mask,
inputs_embeds=input_tensor,
past_key_values_length=past_seen_tokens,
is_training=self.training,
):
return None

class Qwen2ForCausalLM(Qwen2PreTrainedModel):
dtype = input_tensor.dtype
min_dtype = float(ops.finfo(dtype).min)
sequence_length = input_tensor.shape[1]
if using_static_cache:
target_length = past_key_values.get_max_length()
else:
target_length = (
attention_mask.shape[-1]
if isinstance(attention_mask, mindspore.Tensor)
else past_seen_tokens + sequence_length + 1
)

"""
This class represents a Qwen2 model for causal language modeling (LM). It is a subclass of Qwen2PreTrainedModel.
The Qwen2ForCausalLM class provides methods for initializing the model, setting and getting input and output
embeddings, setting and getting the decoder, forwarding the model, and preparing inputs for generation.

To initialize an instance of the Qwen2ForCausalLM class, a configuration object should be passed as a parameter
to the forwardor. The model's architecture and settings are defined by this configuration.

The Qwen2ForCausalLM class has the following methods:

- `__init__`: Initializes the Qwen2ForCausalLM instance with the given configuration.
- `get_input_embeddings`: Returns the input embeddings of the model.
- `set_input_embeddings`: Sets the input embeddings of the model to the given value.
- `get_output_embeddings`: Returns the output embeddings of the model.
- `set_output_embeddings`: Sets the output embeddings of the model to the given new_embeddings.
- `set_decoder`: Sets the decoder of the model to the given decoder.
- `get_decoder`: Returns the decoder of the model.
- `forward`: Constructs the model using the provided input arguments.
This method returns a tuple of outputs, including the logits and optionally the loss, past key values,
hidden states, and attentions.
- `prepare_inputs_for_generation`: Prepares the inputs for generation. This method takes input_ids, past_key_values,
attention_mask, inputs_embeds, and additional keyword arguments as input and returns a dictionary of model inputs.
- `_reorder_cache(past_key_values, beam_idx)`: Reorders the past key values according to the given beam indices.
This method is static and is used internally in the class.

Example:
```python
>>> from transformers import Qwen2ForCausalLM, Qwen2Config
...
>>> # Create a configuration object
>>> config = Qwen2Config(vocab_size=100, hidden_size=512)
...
>>> # Initialize a Qwen2ForCausalLM instance
>>> model = Qwen2ForCausalLM(config)
...
>>> # Set the input embeddings
>>> embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
>>> model.set_input_embeddings(embeddings)
...
>>> # Get the output embeddings
>>> output_embeddings = model.get_output_embeddings()
...
>>> # Set the decoder
>>> decoder = Qwen2Model(config)
>>> model.set_decoder(decoder)
...
>>> # Get the decoder
>>> decoder = model.get_decoder()
...
>>> # Construct the model
>>> input_ids = [1, 2, 3]
>>> attention_mask = [1, 1, 1]
>>> outputs = model.forward(input_ids=input_ids, attention_mask=attention_mask)
...
>>> # Prepare inputs for generation
>>> input_ids = [4, 5, 6]
>>> past_key_values = [tensor1, tensor2]
>>> attention_mask = [1, 1, 1]
>>> inputs_embeds = [embedding1, embedding2]
>>> model_inputs = model.prepare_inputs_for_generation(input_ids, past_key_values, attention_mask, inputs_embeds)
...
>>> # Reorder cache
>>> past_key_values = [tensor1, tensor2]
>>> beam_idx = [0, 1, 2]
>>> reordered_past = Qwen2ForCausalLM._reorder_cache(past_key_values, beam_idx)
```
"""
_tied_weights_keys = ["lm_head.weight"]
# In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
attention_mask,
sequence_length=sequence_length,
target_length=target_length,
dtype=dtype,
min_dtype=min_dtype,
cache_position=cache_position,
batch_size=input_tensor.shape[0],
)

def __init__(self, config):
"""
Initializes a new instance of the Qwen2ForCausalLM class.
return causal_mask

Args:
self: The object itself.
config: An instance of the Qwen2Config class containing the configuration settings for the model.
This parameter is required and must not be None.

Returns:
None
class Qwen2ForCausalLM(Qwen2PreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]

Raises:
None
"""
def __init__(self, config):
super().__init__(config)
self.model = Qwen2Model(config)
self.vocab_size = config.vocab_size
@@ -1033,120 +675,21 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
self.post_init()

def get_input_embeddings(self):
"""
Returns the input embeddings from the model.

Args:
self (Qwen2ForCausalLM): The object instance of the Qwen2ForCausalLM class.
This parameter represents the instance of the Qwen2ForCausalLM class, which contains the model
for which input embeddings are to be retrieved.

Returns:
None: This method returns None, as it directly accesses and returns the input embeddings from the model.

Raises:
None.
"""
return self.model.embed_tokens

def set_input_embeddings(self, value):
"""
Sets the input embeddings for the Qwen2ForCausalLM model.

Args:
self (Qwen2ForCausalLM): The instance of Qwen2ForCausalLM.
value (object): The input embeddings to be set for the model.
It can be an instance of a custom embedding class or any other object with
the required attributes and methods.

Returns:
None.

Raises:
None.
"""
self.model.embed_tokens = value

def get_output_embeddings(self):
"""
Method: get_output_embeddings

Description:
This method returns the output embeddings from the Qwen2ForCausalLM model.

Args:
self: Qwen2ForCausalLM object.
Represents the instance of the Qwen2ForCausalLM class.

Returns:
None
This method returns None.

Raises:
None
"""
return self.lm_head

def set_output_embeddings(self, new_embeddings):
"""Sets the output embeddings for the Qwen2ForCausalLM model.

Args:
self (Qwen2ForCausalLM): The instance of the Qwen2ForCausalLM class.
new_embeddings: The new embeddings to be set for the output layer.
This can be a tensor or any other object that can be assigned to the 'lm_head' attribute of the
Qwen2ForCausalLM instance.

Returns:
None.

Raises:
None.
"""
self.lm_head = new_embeddings

def set_decoder(self, decoder):
"""
Sets the decoder for the Qwen2ForCausalLM object.

Args:
self (Qwen2ForCausalLM): An instance of the Qwen2ForCausalLM class.
decoder: The decoder object to be set as the model for Qwen2ForCausalLM.
The decoder should implement the necessary methods and functionality required by Qwen2ForCausalLM.

Returns:
None.

Raises:
None.

Note:
The decoder object should be compatible with the Qwen2ForCausalLM class and fulfill the requirements
necessary for generating predictions or processing inputs.

Example:
```python
>>> qwen2 = Qwen2ForCausalLM()
>>> decoder = Decoder()
>>> qwen2.set_decoder(decoder)
```
"""
self.model = decoder

def get_decoder(self):
"""
Method to retrieve the decoder model from the Qwen2ForCausalLM class.

Args:
self (object): An instance of the Qwen2ForCausalLM class.
This parameter is required for accessing the decoder model.

Returns:
model:
The method returns the decoder model associated with the Qwen2ForCausalLM class.

Raises:
None.
"""
return self.model

def forward(
@@ -1161,6 +704,7 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[mindspore.Tensor] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
@@ -1170,24 +714,24 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Returns:
Union[Tuple, CausalLMOutputWithPast]

Example:
```python
>>> from transformers import AutoTokenizer, Qwen2ForCausalLM
...
>>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
...
>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")
...
>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```
"""

```python
>>> from transformers import AutoTokenizer, Qwen2ForCausalLM

>>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```"""

output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1205,6 +749,7 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
cache_position=cache_position,
)

hidden_states = outputs[0]
@@ -1217,9 +762,11 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
shift_logits = logits[..., :-1, :]
shift_labels = labels[..., 1:]
# Flatten the tokens
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, self.config.vocab_size)
shift_labels = shift_labels.view(-1)
loss = F.cross_entropy(shift_logits, shift_labels)
# Enable model parallelism
loss = loss_fct(shift_logits, shift_labels)

if not return_dict:
output = (logits,) + outputs[1:]
@@ -1233,68 +780,28 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
attentions=outputs.attentions,
)

# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
self,
input_ids,
past_key_values=None,
attention_mask=None,
inputs_embeds=None,
cache_position=None,
position_ids=None,
use_cache=True,
**kwargs,
):
"""
Prepare inputs for generation.

Args:
self (Qwen2ForCausalLM): An instance of the Qwen2ForCausalLM class.
input_ids (torch.Tensor): The input token IDs of shape (batch_size, sequence_length).
past_key_values (Union[Cache, tuple, None]): The cached key-value states from previous generations.
If past_key_values is an instance of Cache, it contains information about the sequence length,
past length, and maximum cache length. If past_key_values is a tuple, it contains the past length.
If past_key_values is None, no cached key-value states are provided.
attention_mask (torch.Tensor, optional): The attention mask tensor of shape (batch_size, sequence_length).
It helps to mask out tokens that should not be attended to, such as padding tokens.
inputs_embeds (torch.Tensor, optional): The input embeddings tensor of shape (batch_size, sequence_length, hidden_size).
It represents the embedded representation of the input tokens.
**kwargs: Additional keyword arguments.

Returns:
dict:
A dictionary containing the model inputs for generation:

- If inputs_embeds is not None and past_key_values is None, the dictionary contains
{'inputs_embeds': inputs_embeds}.
- Otherwise, the dictionary contains {'input_ids': input_ids}.
- The dictionary also includes 'position_ids', 'past_key_values', 'use_cache', and 'attention_mask'.

Raises:
None.
"""
# Omit tokens covered by past_key_values
# If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
# Exception 1: when passing input_embeds, input_ids may be missing entries
# Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
if past_key_values is not None:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
max_cache_length = past_key_values.get_max_length()
else:
cache_length = past_length = past_key_values[0][0].shape[2]
max_cache_length = None

# Keep only the unprocessed tokens:
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
# some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
# input)
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
# input_ids based on the past_length.
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

# If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
if (
max_cache_length is not None
and attention_mask is not None
and cache_length + input_ids.shape[1] > max_cache_length
):
attention_mask = attention_mask[:, -max_cache_length:]
if inputs_embeds is not None: # Exception 1
if 0 not in input_ids.shape:
input_ids = input_ids[:, -cache_position.shape[0] :]
elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
input_ids = ops.index_select(input_ids, -1, cache_position)

position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.int().cumsum(-1) - 1
@@ -1303,106 +810,44 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
position_ids = position_ids[:, -input_ids.shape[1] :]

# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}

if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if inputs_embeds is not None:
batch_size, sequence_length = inputs_embeds.shape
else:
batch_size, sequence_length = input_ids.shape

dtype = self.lm_head.weight.dtype
min_dtype = float(ops.finfo(dtype).min)

attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
attention_mask,
sequence_length=sequence_length,
target_length=past_key_values.get_max_length(),
dtype=dtype,
min_dtype=min_dtype,
cache_position=cache_position,
batch_size=batch_size,
)

model_inputs.update(
{
"position_ids": position_ids,
"cache_position": cache_position,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"use_cache": use_cache,
"attention_mask": attention_mask,
}
)
return model_inputs

@staticmethod
def _reorder_cache(past_key_values, beam_idx):
"""
Method to reorder the cache based on the beam index.

Args:
past_key_values (tuple): A tuple containing the past key values for each layer. Each element in the tuple
should be a tensor representing the past state for a layer.
beam_idx (torch.Tensor): A tensor containing the beam indices used for reordering the past states.

Returns:
None: This method modifies the input past_key_values in place and does not return any explicit value.

Raises:
ValueError: If the past_key_values or beam_idx are not in the expected format or shape.
IndexError: If the beam indices provided in beam_idx are out of bounds or not applicable to the
past_key_values.
"""
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),
)
return reordered_past


class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):

"""
Qwen2ForSequenceClassification is a class representing a sequence classification model that inherits from
Qwen2PreTrainedModel. It includes methods for initializing the model with a configuration, getting
and setting input embeddings, and forwarding the model for sequence classification.

Attributes:
num_labels (int): The number of labels for sequence classification.

Methods:
__init__: Initializes the sequence classification model with the given configuration.
get_input_embeddings: Retrieves the input embeddings from the model.
set_input_embeddings: Sets the input embeddings for the model.
forward: Constructs the sequence classification
model with the specified inputs and returns the sequence classifier output with past values.

Args:
input_ids (Tensor, optional): The input tensor of shape `(batch_size, sequence_length)`
representing the input sequence.
attention_mask (Tensor, optional): The attention mask tensor of shape `(batch_size, sequence_length)`
indicating which tokens should be attended to.
position_ids (Tensor, optional): The position IDs tensor of shape `(batch_size, sequence_length)`
representing the position of each token in the input sequence.
past_key_values (List[Tensor], optional): The list of past key values tensors for handling incremental decoding.
inputs_embeds (Tensor, optional): The input embeddings tensor of shape
`(batch_size, sequence_length, hidden_size)` representing the embedded input sequence.
labels (Tensor, optional): The tensor of shape `(batch_size,)` representing the labels for computing
the sequence classification/regression loss.
use_cache (bool, optional): Indicates whether to use the cache for handling incremental decoding.
output_attentions (bool, optional): Indicates whether to output attentions.
output_hidden_states (bool, optional): Indicates whether to output hidden states.
return_dict (bool, optional): Indicates whether to return a dictionary of outputs.

Returns:
Union[Tuple, SequenceClassifierOutputWithPast]: The sequence classifier output with past values.

Raises:
ValueError: If batch sizes > 1 and no padding token is defined.

Note:
This docstring is generated based on the provided code and is intended to provide a comprehensive understanding
of the Qwen2ForSequenceClassification class and its methods. Additional details and
specific usage instructions may be available in the official documentation or source code.
"""
def __init__(self, config):
"""
Initializes a new instance of the Qwen2ForSequenceClassification class.

Args:
self: The instance of the class.
config: An object of the Qwen2Config class containing the configuration settings for the model.

Returns:
None

Raises:
None
"""
super().__init__(config)
self.num_labels = config.num_labels
self.model = Qwen2Model(config)
@@ -1412,35 +857,9 @@ class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
self.post_init()

def get_input_embeddings(self):
"""
This method retrieves the input embeddings from the Qwen2ForSequenceClassification model.

Args:
self: An instance of the Qwen2ForSequenceClassification class.

Returns:
embed_tokens: The method returns the input embeddings from the model.

Raises:
This method does not raise any exceptions.
"""
return self.model.embed_tokens

def set_input_embeddings(self, value):
"""
Set the input embeddings for the Qwen2ForSequenceClassification model.

Args:
self (Qwen2ForSequenceClassification): The instance of the Qwen2ForSequenceClassification class.
value (object): The input embeddings to be set for the model.
Should be of type torch.Tensor or any compatible object.

Returns:
None.

Raises:
None.
"""
self.model.embed_tokens = value

def forward(
@@ -1457,11 +876,10 @@ class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutputWithPast]:
r"""
Args:
labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -1509,14 +927,17 @@ class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
self.config.problem_type = "multi_label_classification"

if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = F.mse_loss(pooled_logits.squeeze(), labels.squeeze())
loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
else:
loss = F.mse_loss(pooled_logits, labels)
loss = loss_fct(pooled_logits, labels)
elif self.config.problem_type == "single_label_classification":
loss = F.cross_entropy(pooled_logits.view(-1, self.num_labels), labels.view(-1))
loss_fct = CrossEntropyLoss()
loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss = F.binary_cross_entropy_with_logits(pooled_logits, labels)
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(pooled_logits, labels)
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -1529,9 +950,87 @@ class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
attentions=transformer_outputs.attentions,
)


# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Qwen2, LLAMA->QWEN2
class Qwen2ForTokenClassification(Qwen2PreTrainedModel):
    """Qwen2 transformer with a token-classification head on top.

    A dropout layer followed by a linear projection maps every hidden state to
    `config.num_labels` per-token scores (e.g. for NER or POS tagging).
    """

    def __init__(self, config):
        """Build the Qwen2 backbone, the dropout layer and the scoring head."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen2Model(config)
        # Prefer an explicit classifier dropout, fall back to the model's
        # hidden dropout, and finally to a default of 0.1.
        dropout_rate = getattr(config, "classifier_dropout", None)
        if dropout_rate is None:
            dropout_rate = getattr(config, "hidden_dropout", None)
        if dropout_rate is None:
            dropout_rate = 0.1
        self.dropout = nn.Dropout(dropout_rate)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the backbone's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the backbone's token embedding module."""
        self.model.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[List[mindspore.Tensor]] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`mindspore.Tensor`, *optional*):
            Labels for computing the token classification loss. Indices should be in
            `[0, ..., config.num_labels - 1]`; the tensor is flattened before the
            cross-entropy computation.
        """
        return_dict = self.config.use_return_dict if return_dict is None else return_dict

        backbone_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Per-token scores: dropout on the last hidden states, then linear head.
        token_states = self.dropout(backbone_outputs[0])
        logits = self.score(token_states)

        loss = None
        if labels is not None:
            loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            extras = backbone_outputs[2:]
            if loss is None:
                return (logits,) + extras
            return (loss, logits) + extras

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=backbone_outputs.hidden_states,
            attentions=backbone_outputs.attentions,
        )

__all__ = [
"Qwen2ForCausalLM",
"Qwen2Model",
"Qwen2PreTrainedModel",
"Qwen2ForSequenceClassification",
"Qwen2ForTokenClassification"
]

+ 25
- 0
mindnlp/transformers/models/rag/__init__.py View File

@@ -0,0 +1,25 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Rag Model """
from . import configuration_rag, modeling_rag, retrieval_rag, tokenization_rag
from .configuration_rag import *
from .modeling_rag import *
from .retrieval_rag import *
from .tokenization_rag import *

# Aggregate the public API re-exported from each submodule.
__all__ = (
    list(configuration_rag.__all__)
    + list(modeling_rag.__all__)
    + list(retrieval_rag.__all__)
    + list(tokenization_rag.__all__)
)

+ 127
- 0
mindnlp/transformers/models/rag/configuration_rag.py View File

@@ -0,0 +1,127 @@
# coding=utf-8
# Copyright 2020, The RAG Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RAG model configuration"""

from ...configuration_utils import PretrainedConfig


class RagConfig(PretrainedConfig):
    """Composite configuration for RAG (Retrieval-Augmented Generation) models.

    Wraps a `question_encoder` sub-configuration and a `generator` sub-configuration
    (both must be provided as dicts in `kwargs`) together with the retrieval
    hyperparameters: document separators (`title_sep`, `doc_sep`), number of
    retrieved documents (`n_docs`), dataset/index location, retrieval batch size,
    and loss-related flags (`reduce_loss`, `label_smoothing`, ...).
    """

    model_type = "rag"
    is_composition = True

    def __init__(
        self,
        vocab_size=None,
        is_encoder_decoder=True,
        prefix=None,
        bos_token_id=None,
        pad_token_id=None,
        eos_token_id=None,
        decoder_start_token_id=None,
        title_sep=" / ",
        doc_sep=" // ",
        n_docs=5,
        max_combined_length=300,
        retrieval_vector_size=768,
        retrieval_batch_size=8,
        dataset="wiki_dpr",
        dataset_split="train",
        index_name="compressed",
        index_path=None,
        passages_path=None,
        use_dummy_dataset=False,
        reduce_loss=False,
        label_smoothing=0.0,
        do_deduplication=True,
        exclude_bos_score=False,
        do_marginalize=False,
        output_retrieved=False,
        use_cache=True,
        forced_eos_token_id=None,
        dataset_revision=None,
        **kwargs,
    ):
        """Create a RAG configuration.

        Raises:
            ValueError: If `kwargs` does not contain both a `question_encoder` and a
                `generator` sub-configuration dict.
        """
        super().__init__(
            bos_token_id=bos_token_id,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            prefix=prefix,
            vocab_size=vocab_size,
            **kwargs,
        )
        # Explicit validation instead of `assert`: asserts are stripped when
        # running under `python -O`, which would silently skip this check.
        if "question_encoder" not in kwargs or "generator" not in kwargs:
            raise ValueError("Config has to be initialized with question_encoder and generator config")
        question_encoder_config = kwargs.pop("question_encoder")
        question_encoder_model_type = question_encoder_config.pop("model_type")
        decoder_config = kwargs.pop("generator")
        decoder_model_type = decoder_config.pop("model_type")

        # Imported lazily to avoid a circular import with the auto-config registry.
        from ..auto.configuration_auto import AutoConfig

        self.question_encoder = AutoConfig.for_model(question_encoder_model_type, **question_encoder_config)
        self.generator = AutoConfig.for_model(decoder_model_type, **decoder_config)

        self.reduce_loss = reduce_loss
        self.label_smoothing = label_smoothing
        self.exclude_bos_score = exclude_bos_score
        self.do_marginalize = do_marginalize

        self.title_sep = title_sep
        self.doc_sep = doc_sep
        self.n_docs = n_docs
        self.max_combined_length = max_combined_length

        self.dataset = dataset
        self.dataset_split = dataset_split
        self.index_name = index_name

        self.retrieval_vector_size = retrieval_vector_size
        self.retrieval_batch_size = retrieval_batch_size
        self.passages_path = passages_path
        self.index_path = index_path
        self.use_dummy_dataset = use_dummy_dataset
        self.dataset_revision = dataset_revision

        self.output_retrieved = output_retrieved

        self.do_deduplication = do_deduplication

        self.use_cache = use_cache
        self.forced_eos_token_id = forced_eos_token_id

        # Fall back to the generator's forced EOS token when none was given.
        if self.forced_eos_token_id is None:
            self.forced_eos_token_id = getattr(self.generator, "forced_eos_token_id", None)

    @classmethod
    def from_question_encoder_generator_configs(
        cls, question_encoder_config: PretrainedConfig, generator_config: PretrainedConfig, **kwargs
    ) -> PretrainedConfig:
        r"""
        Instantiate a [`RagConfig`] (or a derived class) from a pre-trained question encoder model
        configuration and a generator model configuration.

        Returns:
            [`RagConfig`]: An instance of a configuration object
        """
        return cls(question_encoder=question_encoder_config.to_dict(), generator=generator_config.to_dict(), **kwargs)


__all__ = [
"RagConfig"
]

+ 1521
- 0
mindnlp/transformers/models/rag/modeling_rag.py View File

@@ -0,0 +1,1521 @@
# coding=utf-8
# Copyright 2020, The RAG Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RAG model implementation."""
# pylint: disable=unexpected-keyword-arg, not-callable, consider-using-generator

import copy
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union

import mindspore
import numpy as np

import mindnlp.core.nn.functional as F
from mindnlp.core import ops
from mindnlp.utils import logging
from .configuration_rag import RagConfig
from .retrieval_rag import RagRetriever
from ...configuration_utils import PretrainedConfig
from ...generation import BeamSearchScorer, GenerationConfig, LogitsProcessorList, StoppingCriteriaList
from ...modeling_outputs import ModelOutput
from ...modeling_utils import PreTrainedModel

# Module-level logger following the library-wide logging setup.
logger = logging.get_logger(__name__)

# Config class name referenced by auto-generated docstrings.
_CONFIG_FOR_DOC = "RagConfig"


@dataclass
class RetrievAugLMMarginOutput(ModelOutput):
    """
    Base class for retriever augmented marginalized models outputs.

    Args:
        loss (`mindspore.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head, possibly marginalized over all retrieved
            documents for each vocabulary token.
        doc_scores (`mindspore.Tensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embedding (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`.
        past_key_values (`List[mindspore.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Precomputed decoder key/value states — one tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)` per layer — used to speed up sequential decoding.
        retrieved_doc_embeds (`mindspore.Tensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
            Embedded documents retrieved by the retriever; used together with
            `question_encoder_last_hidden_state` to compute the `doc_scores`.
        retrieved_doc_ids (`mindspore.Tensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
            The indexes of the embedded documents retrieved by the retriever.
        context_input_ids (`mindspore.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input ids post-processed from the retrieved documents and the question encoder `input_ids`.
        context_attention_mask (`mindspore.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids`.
        question_encoder_last_hidden_state (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden states at the output of the last layer of the question encoder
            (pooled output) of the model.
        question_enc_hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Question encoder hidden states at the embedding output and each layer; each tensor of shape
            `(batch_size, sequence_length, hidden_size)`.
        question_enc_attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Question encoder self-attention weights (post-softmax) per layer; each tensor of shape
            `(batch_size, num_heads, sequence_length, sequence_length)`.
        generator_enc_last_hidden_state (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden states at the output of the last layer of the generator encoder.
        generator_enc_hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Generator encoder hidden states at the embedding output and each layer.
        generator_enc_attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Generator encoder self-attention weights (post-softmax) per layer.
        generator_dec_hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Generator decoder hidden states at the embedding output and each layer.
        generator_dec_attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Generator decoder self-attention weights (post-softmax) per layer.
        generator_cross_attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Generator decoder cross-attention weights (post-softmax) per layer.
    """

    loss: Optional[mindspore.Tensor] = None
    # `Optional[...]` added: these fields default to None before being populated.
    logits: Optional[mindspore.Tensor] = None
    doc_scores: Optional[mindspore.Tensor] = None
    past_key_values: Optional[List[mindspore.Tensor]] = None
    retrieved_doc_embeds: Optional[mindspore.Tensor] = None
    retrieved_doc_ids: Optional[mindspore.Tensor] = None
    context_input_ids: Optional[mindspore.Tensor] = None
    context_attention_mask: Optional[mindspore.Tensor] = None
    question_encoder_last_hidden_state: Optional[mindspore.Tensor] = None
    question_enc_hidden_states: Optional[Tuple[mindspore.Tensor, ...]] = None
    question_enc_attentions: Optional[Tuple[mindspore.Tensor, ...]] = None
    generator_enc_last_hidden_state: Optional[mindspore.Tensor] = None
    generator_enc_hidden_states: Optional[Tuple[mindspore.Tensor, ...]] = None
    generator_enc_attentions: Optional[Tuple[mindspore.Tensor, ...]] = None
    generator_dec_hidden_states: Optional[Tuple[mindspore.Tensor, ...]] = None
    generator_dec_attentions: Optional[Tuple[mindspore.Tensor, ...]] = None
    generator_cross_attentions: Optional[Tuple[mindspore.Tensor, ...]] = None


@dataclass
class RetrievAugLMOutput(ModelOutput):
    """
    Base class for retriever augmented language model outputs (no marginalized loss).

    Args:
        logits (`mindspore.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head, possibly marginalized over all retrieved
            documents for each vocabulary token.
        doc_scores (`mindspore.Tensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embedding (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`.
        past_key_values (`List[mindspore.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Precomputed decoder key/value states — one tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)` per layer — used to speed up sequential decoding.
        retrieved_doc_embeds (`mindspore.Tensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
            Embedded documents retrieved by the retriever; used together with
            `question_encoder_last_hidden_state` to compute the `doc_scores`.
        retrieved_doc_ids (`mindspore.Tensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
            The indexes of the embedded documents retrieved by the retriever.
        context_input_ids (`mindspore.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input ids post-processed from the retrieved documents and the question encoder `input_ids`.
        context_attention_mask (`mindspore.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids`.
        question_encoder_last_hidden_state (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden states at the output of the last layer of the question encoder
            (pooled output) of the model.
        question_enc_hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Question encoder hidden states at the embedding output and each layer; each tensor of shape
            `(batch_size, sequence_length, hidden_size)`.
        question_enc_attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Question encoder self-attention weights (post-softmax) per layer; each tensor of shape
            `(batch_size, num_heads, sequence_length, sequence_length)`.
        generator_enc_last_hidden_state (`mindspore.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden states at the output of the last layer of the generator encoder.
        generator_enc_hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Generator encoder hidden states at the embedding output and each layer.
        generator_enc_attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Generator encoder self-attention weights (post-softmax) per layer.
        generator_dec_hidden_states (`tuple(mindspore.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Generator decoder hidden states at the embedding output and each layer.
        generator_dec_attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Generator decoder self-attention weights (post-softmax) per layer.
        generator_cross_attentions (`tuple(mindspore.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Generator decoder cross-attention weights (post-softmax) per layer.
    """

    # `Optional[...]` added: these fields default to None before being populated.
    logits: Optional[mindspore.Tensor] = None
    doc_scores: Optional[mindspore.Tensor] = None
    past_key_values: Optional[List[mindspore.Tensor]] = None
    retrieved_doc_embeds: Optional[mindspore.Tensor] = None
    retrieved_doc_ids: Optional[mindspore.Tensor] = None
    context_input_ids: Optional[mindspore.Tensor] = None
    context_attention_mask: Optional[mindspore.Tensor] = None
    question_encoder_last_hidden_state: Optional[mindspore.Tensor] = None
    question_enc_hidden_states: Optional[Tuple[mindspore.Tensor, ...]] = None
    question_enc_attentions: Optional[Tuple[mindspore.Tensor, ...]] = None
    generator_enc_last_hidden_state: Optional[mindspore.Tensor] = None
    generator_enc_hidden_states: Optional[Tuple[mindspore.Tensor, ...]] = None
    generator_enc_attentions: Optional[Tuple[mindspore.Tensor, ...]] = None
    generator_dec_hidden_states: Optional[Tuple[mindspore.Tensor, ...]] = None
    generator_dec_attentions: Optional[Tuple[mindspore.Tensor, ...]] = None
    generator_cross_attentions: Optional[Tuple[mindspore.Tensor, ...]] = None


class RagPreTrainedModel(PreTrainedModel):
    r"""
    Base class for RAG models.

    RAG models were released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP
    Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.

    RAG is a retriever augmented model and encapsulates three components: a question encoder, a dataset
    retriever and a generator. The encoder and generator are trainable while the retriever is just an
    indexed dataset.
    """

    config_class = RagConfig
    base_model_prefix = "rag"

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """Load a pretrained RAG model.

        Fast initialization is not supported for composite models at the moment,
        so it is forcibly disabled before delegating to the parent implementation.
        """
        kwargs["_fast_init"] = False
        return super().from_pretrained(*args, **kwargs)

    @classmethod
    def from_pretrained_question_encoder_generator(
        cls,
        question_encoder_pretrained_model_name_or_path: str = None,
        generator_pretrained_model_name_or_path: str = None,
        retriever: RagRetriever = None,
        **kwargs,
    ) -> PreTrainedModel:
        r"""
        Instantiate a question encoder and a generator from one or two base classes of the library from
        pretrained model checkpoints.

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are
        deactivated). To train the model, you need to first set it back in training mode with
        `model.train()`.

        Params:
            question_encoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                Information necessary to initiate the question encoder: a *model id* of a pretrained model
                hosted on huggingface.co, or a path to a *directory* saved with
                [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
            generator_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                Information necessary to initiate the generator; same formats as above.
            retriever ([`RagRetriever`], *optional*):
                The retriever to use.
            kwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to update the configuration objects (after they have been loaded) and to
                initiate the model (e.g., `output_attentions=True`).

                - To update the question_encoder configuration, use the prefix *question_encoder_* for each
                  configuration parameter.
                - To update the generator configuration, use the prefix *generator_* for each configuration
                  parameter.
                - To update the parent model configuration, do not use a prefix for each configuration
                  parameter.

                Behaves differently depending on whether a `config` is provided or automatically loaded.

        Returns:
            `PreTrainedModel`: the assembled RAG model.

        Raises:
            ValueError: if neither a `model` kwarg nor a pretrained name/path is supplied for the
                question encoder or for the generator.
        """
        # Route the prefixed kwargs to their sub-model destinations.
        kwargs_question_encoder = {
            argument[len("question_encoder_"):]: value
            for argument, value in kwargs.items()
            if argument.startswith("question_encoder_")
        }

        kwargs_generator = {
            argument[len("generator_"):]: value
            for argument, value in kwargs.items()
            if argument.startswith("generator_")
        }

        # Remove the routed question_encoder/generator kwargs from the parent kwargs.
        for key in kwargs_question_encoder:
            del kwargs["question_encoder_" + key]
        for key in kwargs_generator:
            del kwargs["generator_" + key]

        # Load and initialize the question_encoder and generator.
        # Explicit ValueError instead of `assert`: asserts are stripped under `python -O`.
        question_encoder = kwargs_question_encoder.pop("model", None)
        if question_encoder is None:
            if question_encoder_pretrained_model_name_or_path is None:
                raise ValueError(
                    "If `model` is not defined as an argument, a"
                    " `question_encoder_pretrained_model_name_or_path` has to be defined"
                )
            from ..auto.modeling_auto import AutoModel

            if "config" not in kwargs_question_encoder:
                from ..auto.configuration_auto import AutoConfig

                question_encoder_config, kwargs_question_encoder = AutoConfig.from_pretrained(
                    question_encoder_pretrained_model_name_or_path,
                    **kwargs_question_encoder,
                    return_unused_kwargs=True,
                )
                kwargs_question_encoder["config"] = question_encoder_config

            question_encoder = AutoModel.from_pretrained(
                question_encoder_pretrained_model_name_or_path, **kwargs_question_encoder
            )

        generator = kwargs_generator.pop("model", None)
        if generator is None:
            if generator_pretrained_model_name_or_path is None:
                raise ValueError(
                    "If `generator_model` is not defined as an argument, a"
                    " `generator_pretrained_model_name_or_path` has to be defined"
                )
            from ..auto.modeling_auto import AutoModelForSeq2SeqLM

            if "config" not in kwargs_generator:
                from ..auto.configuration_auto import AutoConfig

                generator_config, kwargs_generator = AutoConfig.from_pretrained(
                    generator_pretrained_model_name_or_path, **kwargs_generator, return_unused_kwargs=True
                )

                kwargs_generator["config"] = generator_config

            generator = AutoModelForSeq2SeqLM.from_pretrained(
                generator_pretrained_model_name_or_path, **kwargs_generator
            )

        # Instantiate the composite config with the remaining kwargs unless one was passed explicitly.
        config = kwargs.get("config", None)
        if config is None:
            config = RagConfig.from_question_encoder_generator_configs(
                question_encoder.config, generator.config, **kwargs
            )

        return cls(question_encoder=question_encoder, generator=generator, config=config,
                   retriever=retriever)  # pylint: disable=unexpected-keyword-arg


class RagModel(RagPreTrainedModel):
    """
    Base RAG model: wires a question encoder, an optional retriever and a seq2seq
    generator together for retrieval-augmented generation.

    The question encoder embeds the input question, the retriever fetches `n_docs`
    supporting passages per question, and the generator conditions on the
    concatenated (question + passage) contexts. Document relevance scores
    (`doc_scores`) are the inner products between the question embedding and the
    retrieved passage embeddings.
    """

    def __init__(
        self,
        config: Optional[PretrainedConfig] = None,
        question_encoder: Optional[PreTrainedModel] = None,
        generator: Optional[PreTrainedModel] = None,
        retriever: Optional[RagRetriever] = None,  # or maybe just use a `set_retriever(...)` method
        **kwargs,
    ):
        """
        Either `config` or both `question_encoder` and `generator` must be given;
        a missing `config` is derived from the two sub-model configs, and missing
        sub-models are instantiated from `config`.
        """
        assert config is not None or (
            question_encoder is not None and generator is not None
        ), "Either a configuration or an question_encoder and a generator has to be provided."

        if config is None:
            config = RagConfig.from_question_encoder_generator_configs(
                question_encoder.config, generator.config, **kwargs
            )
        else:
            assert isinstance(config, self.config_class), f"config: {config} has to be of type {self.config_class}"
        super().__init__(config)

        # Lazily instantiate sub-models from the config when they were not passed in.
        # Imports are local to avoid a circular import with the auto-model registry.
        if question_encoder is None:
            from ..auto.modeling_auto import AutoModel

            question_encoder = AutoModel.from_config(
                config.question_encoder
            )

        if generator is None:
            from ..auto.modeling_auto import AutoModelForSeq2SeqLM

            generator = AutoModelForSeq2SeqLM.from_config(
                config.generator
            )

        # A retriever is optional here; it can also be attached later via the
        # owning wrapper's `set_retriever(...)`.
        self.retriever = retriever
        if self.retriever is not None:
            assert isinstance(
                retriever, RagRetriever
            ), f"`self.retriever` is of type {type(self.retriever)}, but should be of type `RagRetriever`"

        self.question_encoder = question_encoder
        self.generator = generator

        # Optional trainable context encoder used to re-embed retrieved passages
        # during end-to-end retrieval training (see `forward`).
        self.ctx_encoder = None
        self.context_encoder_training = False

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        decoder_input_ids: Optional[mindspore.Tensor] = None,
        decoder_attention_mask: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        doc_scores: Optional[mindspore.Tensor] = None,
        context_input_ids: Optional[mindspore.Tensor] = None,
        context_attention_mask: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_retrieved: Optional[bool] = None,
        n_docs: Optional[int] = None,
    ) -> Union[Tuple[mindspore.Tensor], RetrievAugLMOutput]:
        """
        Encode the question, retrieve supporting documents when needed, and run
        the generator on the combined contexts.

        Retrieval is skipped when `context_input_ids`/`context_attention_mask`/
        `doc_scores` are all supplied, or when pre-computed `encoder_outputs` are
        passed (the RAG-token generation path).
        """
        # Fall back to config defaults for any unset flags.
        n_docs = n_docs if n_docs is not None else self.config.n_docs
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_retrieved = output_retrieved if output_retrieved is not None else self.config.output_retrieved

        # whether retriever has to be used
        has_to_retrieve = (
            self.retriever is not None
            and (context_input_ids is None or context_attention_mask is None or doc_scores is None)
            and encoder_outputs is None
        )
        # encoder_outputs are pre-computed during RAG-token generation
        if encoder_outputs is None:
            if has_to_retrieve:
                question_enc_outputs = self.question_encoder(
                    input_ids, attention_mask=attention_mask, return_dict=True
                )
                question_encoder_last_hidden_state = question_enc_outputs[0]  # hidden states of question encoder

                # The retriever works on numpy embeddings; cast to float32 first.
                retriever_outputs = self.retriever(
                    input_ids,
                    question_encoder_last_hidden_state.to(mindspore.float32).numpy(),
                    prefix=self.generator.config.prefix,
                    n_docs=n_docs,
                    return_tensors="ms",
                )
                if self.context_encoder_training:
                    # End-to-end retrieval training: re-embed the retrieved
                    # passages with the trainable ctx_encoder so gradients flow
                    # into the document embedder.
                    (
                        context_input_ids,
                        context_attention_mask,
                        retrieved_doc_embeds,
                        retrieved_doc_input_ids,
                        retrieved_doc_attention_mask,
                        retrieved_doc_ids,
                    ) = (
                        retriever_outputs["context_input_ids"],
                        retriever_outputs["context_attention_mask"],
                        retriever_outputs["retrieved_doc_embeds"],
                        retriever_outputs["tokenized_doc_ids"],
                        retriever_outputs["tokenized_doc_attention_mask"],
                        retriever_outputs["doc_ids"],
                    )

                    context_input_ids = context_input_ids.to(input_ids.dtype)
                    context_attention_mask = context_attention_mask.to(input_ids.dtype)

                    retrieved_doc_input_ids = retrieved_doc_input_ids.to(input_ids.dtype)
                    retrieved_doc_attention_mask = retrieved_doc_attention_mask.to(input_ids.dtype)
                    retrieved_doc_embeds = self.ctx_encoder(
                        retrieved_doc_input_ids, attention_mask=retrieved_doc_attention_mask, return_dict=True
                    ).pooler_output
                    # (batch * n_docs, dim) -> (batch, n_docs, dim)
                    retrieved_doc_embeds = retrieved_doc_embeds.view(
                        -1, n_docs, question_encoder_last_hidden_state.shape[1]
                    )  # reshaping

                    # compute doc_scores involving ctx_encoder:
                    # inner product of question embedding with each doc embedding.
                    doc_scores = ops.bmm(
                        question_encoder_last_hidden_state.unsqueeze(1), ops.transpose(retrieved_doc_embeds, 1, 2)
                    ).squeeze(1)

                else:
                    context_input_ids, context_attention_mask, retrieved_doc_embeds, retrieved_doc_ids = (
                        retriever_outputs["context_input_ids"],
                        retriever_outputs["context_attention_mask"],
                        retriever_outputs["retrieved_doc_embeds"],
                        retriever_outputs["doc_ids"],
                    )

                    # set to correct device
                    retrieved_doc_embeds = retrieved_doc_embeds.to(question_encoder_last_hidden_state.dtype)
                    context_input_ids = context_input_ids.to(input_ids.dtype)
                    context_attention_mask = context_attention_mask.to(input_ids.dtype)

                    # compute doc_scores
                    doc_scores = ops.bmm(
                        question_encoder_last_hidden_state.unsqueeze(1), ops.transpose(retrieved_doc_embeds, 1, 2)
                    ).squeeze(1)
            else:
                # No retriever and no pre-computed encoder outputs: the caller
                # must supply the retrieval results directly.
                assert context_input_ids is not None, (
                    "Make sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can"
                    " set a retriever using the `set_retriever(...)` function."
                )
                assert context_attention_mask is not None, (
                    "Make sure that `context_attention_mask` are passed, if no `retriever` is set. Alternatively, you"
                    " can set a retriever using the `set_retriever(...)` function."
                )
                assert doc_scores is not None, (
                    "Make sure that `doc_scores` are passed, if no `retriever` is set. Alternatively, you can set a"
                    " retriever using the `set_retriever(...)` function."
                )

        assert (
            doc_scores is not None
        ), "Make sure that `doc_scores` are passed when passing `encoder_outputs` to the forward function."

        assert (doc_scores.shape[1] % n_docs) == 0, (
            f" The first dimension of `context_input_ids` should be a multiple of `n_docs`={n_docs}, but is"
            f" {context_input_ids.shape[0]}."
        )

        # Decoder input without context documents: replicate each decoder row
        # n_docs times so it lines up with the per-document contexts.
        if decoder_input_ids is not None:
            decoder_input_ids = ops.repeat_interleave(decoder_input_ids, n_docs, dim=0)

        if decoder_attention_mask is not None:
            decoder_attention_mask = decoder_attention_mask.astype(mindspore.int32)
            decoder_attention_mask = ops.repeat_interleave(decoder_attention_mask, n_docs, dim=0)

        gen_outputs = self.generator(
            input_ids=context_input_ids,
            attention_mask=context_attention_mask,
            encoder_outputs=encoder_outputs,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            return_dict=True,
        )

        if not has_to_retrieve:
            question_encoder_last_hidden_state = None
            question_enc_hidden_states = None
            question_enc_attentions = None
            retrieved_doc_embeds = None
            retrieved_doc_ids = None
        else:
            question_enc_hidden_states = question_enc_outputs.hidden_states
            question_enc_attentions = question_enc_outputs.attentions

        if not has_to_retrieve or not output_retrieved:
            # don't output retrieved docs
            # NOTE: fixed from `(None,)` — the previous code leaked a one-element
            # tuple into the output instead of `None` like the fields below.
            context_input_ids = None
            context_attention_mask = None
            retrieved_doc_embeds = None
            retrieved_doc_ids = None

        return RetrievAugLMOutput(
            logits=gen_outputs.logits,
            doc_scores=doc_scores,
            past_key_values=gen_outputs.past_key_values,
            context_input_ids=context_input_ids,
            context_attention_mask=context_attention_mask,
            retrieved_doc_embeds=retrieved_doc_embeds,
            retrieved_doc_ids=retrieved_doc_ids,
            question_encoder_last_hidden_state=question_encoder_last_hidden_state,
            question_enc_hidden_states=question_enc_hidden_states,
            question_enc_attentions=question_enc_attentions,
            generator_enc_last_hidden_state=gen_outputs.encoder_last_hidden_state,
            generator_enc_hidden_states=gen_outputs.encoder_hidden_states,
            generator_enc_attentions=gen_outputs.encoder_attentions,
            generator_dec_hidden_states=gen_outputs.decoder_hidden_states,
            generator_dec_attentions=gen_outputs.decoder_attentions,
            generator_cross_attentions=gen_outputs.cross_attentions,
        )


class RagSequenceForGeneration(RagPreTrainedModel):
    """
    RAG-sequence model: marginalizes over retrieved documents once per output
    *sequence*. Wraps a `RagModel` and adds the sequence-level NLL loss and the
    "thorough" decoding strategy in `generate`.
    """

    def __init__(
        self,
        config: Optional[PretrainedConfig] = None,
        question_encoder: Optional[PreTrainedModel] = None,
        generator: Optional[PreTrainedModel] = None,
        retriever: Optional[RagRetriever] = None,
        **kwargs,
    ):
        """Build from a config or from explicit question encoder / generator sub-models."""
        assert config is not None or (
            question_encoder is not None and generator is not None
        ), "Either a configuration or an encoder and a generator has to be provided."

        if config is None:
            config = RagConfig.from_question_encoder_generator_configs(
                question_encoder.config, generator.config, **kwargs
            )
        super().__init__(config)

        # instantiate model: all heavy lifting is delegated to the inner RagModel
        self.rag = RagModel(config=config, question_encoder=question_encoder, generator=generator, retriever=retriever)

    def set_retriever(self, retriever: RagRetriever):
        """Attach (or replace) the retriever used by the inner RagModel."""
        self.rag.retriever = retriever

    def set_context_encoder_for_training(self, ctx_encoder: PreTrainedModel):
        """Enable end-to-end retrieval training with `ctx_encoder` as the trainable document embedder."""
        self.rag.context_encoder_training = True
        self.rag.ctx_encoder = ctx_encoder

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        decoder_input_ids: Optional[mindspore.Tensor] = None,
        decoder_attention_mask: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        context_input_ids: Optional[mindspore.Tensor] = None,
        context_attention_mask: Optional[mindspore.Tensor] = None,
        doc_scores: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_retrieved: Optional[bool] = None,
        exclude_bos_score: Optional[bool] = None,
        reduce_loss: Optional[bool] = None,
        labels: Optional[mindspore.Tensor] = None,
        n_docs: Optional[int] = None,
        **kwargs,  # needs kwargs for generation
    ) -> RetrievAugLMMarginOutput:
        r"""
        Run the inner RagModel and, when `labels` are given, compute the
        RAG-sequence marginal NLL loss.

        exclude_bos_score (`bool`, *optional*):
            Only relevant if `labels` is passed. If `True`, the score of the BOS token is disregarded when computing
            the loss.
        reduce_loss (`bool`, *optional*):
            Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `mindspore.Tensor.sum`
            operation.
        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
            Legacy dictionary, which is required so that model can use *generate()* function.

        Returns:

        """
        # Resolve defaults from the config.
        n_docs = n_docs if n_docs is not None else self.config.n_docs
        exclude_bos_score = exclude_bos_score if exclude_bos_score is not None else self.config.exclude_bos_score
        reduce_loss = reduce_loss if reduce_loss is not None else self.config.reduce_loss

        if labels is not None:
            # Teacher forcing: feed the labels as decoder input and disable caching.
            if decoder_input_ids is None:
                decoder_input_ids = labels
            use_cache = False

        outputs = self.rag(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_outputs=encoder_outputs,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            context_input_ids=context_input_ids,
            context_attention_mask=context_attention_mask,
            doc_scores=doc_scores,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_retrieved=output_retrieved,
            n_docs=n_docs,
        )

        loss = None
        if labels is not None:
            loss = self.get_nll(
                outputs.logits,
                outputs.doc_scores,
                decoder_input_ids,
                reduce_loss=reduce_loss,
                epsilon=self.config.label_smoothing,
                exclude_bos_score=exclude_bos_score,
                n_docs=n_docs,
            )

        return RetrievAugLMMarginOutput(
            loss=loss,
            logits=outputs.logits,
            doc_scores=outputs.doc_scores,
            past_key_values=outputs.past_key_values,
            context_input_ids=outputs.context_input_ids,
            context_attention_mask=outputs.context_attention_mask,
            retrieved_doc_embeds=outputs.retrieved_doc_embeds,
            retrieved_doc_ids=outputs.retrieved_doc_ids,
            question_encoder_last_hidden_state=outputs.question_encoder_last_hidden_state,
            question_enc_hidden_states=outputs.question_enc_hidden_states,
            question_enc_attentions=outputs.question_enc_attentions,
            generator_enc_last_hidden_state=outputs.generator_enc_last_hidden_state,
            generator_enc_hidden_states=outputs.generator_enc_hidden_states,
            generator_enc_attentions=outputs.generator_enc_attentions,
            generator_dec_hidden_states=outputs.generator_dec_hidden_states,
            generator_dec_attentions=outputs.generator_dec_attentions,
            generator_cross_attentions=outputs.generator_cross_attentions,
        )

    @property
    def retriever(self):
        """The retriever of the inner RagModel."""
        return self.rag.retriever

    @property
    def generator(self):
        """The seq2seq generator of the inner RagModel."""
        return self.rag.generator

    @property
    def question_encoder(self):
        """The question encoder of the inner RagModel."""
        return self.rag.question_encoder

    @mindspore._no_grad()
    def generate(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        context_input_ids: Optional[mindspore.Tensor] = None,
        context_attention_mask: Optional[mindspore.Tensor] = None,
        doc_scores: Optional[mindspore.Tensor] = None,
        do_deduplication: Optional[bool] = None,  # defaults to True
        num_return_sequences: Optional[int] = None,  # defaults to 1
        num_beams: Optional[int] = None,  # defaults to 1
        n_docs: Optional[int] = None,
        **model_kwargs,
    ) -> mindspore.Tensor:
        """
        Implements RAG sequence "thorough" decoding. Read the [`~generation.GenerationMixin.generate`]` documentation
        for more information on how to set other generate input parameters.

        Args:
            input_ids (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                The sequence used as a prompt for the generation. If `input_ids` is not passed, then
                `context_input_ids` has to be provided.
            attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            context_input_ids (`mindspore.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Input IDs post-processed from the retrieved documents and the question encoder input_ids by the
                retriever.
            context_attention_mask (`mindspore.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
                retriever.

                If the model is not initialized with a `retriever` or `input_ids` is not given, `context_input_ids` and
                `context_attention_mask` have to be provided to the forward pass. They are returned by
                [`~RagRetriever.__call__`].
            doc_scores (`mindspore.Tensor` of shape `(batch_size, config.n_docs)`):
                Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
                `question_encoder_last_hidden_state`.

                If the model is not initialized with a `retriever` or `input_ids` is not given, `doc_scores` has to be
                provided to the forward pass. `doc_scores` are returned by [`~RagRetriever.__call__`].
            do_deduplication (`bool`, *optional*):
                Whether or not to deduplicate the generations from different context documents for a given input. Has
                to be set to `False` if used while training with distributed backend.
            num_return_sequences(`int`, *optional*, defaults to 1):
                The number of independently computed returned sequences for each element in the batch. Note that this
                is not the value we pass to the `generator`'s `[`~generation.GenerationMixin.generate`]` function,
                where we set `num_return_sequences` to `num_beams`.
            num_beams (`int`, *optional*, defaults to 1):
                Number of beams for beam search. 1 means no beam search.
            n_docs (`int`, *optional*, defaults to `config.n_docs`)
                Number of documents to retrieve and/or number of documents for which to generate an answer.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional kwargs will be passed to [`~generation.GenerationMixin.generate`].

        Return:
            `mindspore.Tensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
            sequences. The second dimension (sequence length) is either equal to `max_length` or shorter if all batches
            finished early due to the `eos_token_id`.
        """
        n_docs = n_docs if n_docs is not None else self.config.n_docs
        do_deduplication = do_deduplication if do_deduplication is not None else self.config.do_deduplication
        num_doc_return_sequences = (
            num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
        )
        num_beams = num_beams if num_beams is not None else self.config.num_beams

        assert (
            input_ids is not None or context_input_ids is not None
        ), " At least one of input_ids or context_input_ids must be given"

        if self.retriever is not None and context_input_ids is None:
            # Retrieve the document contexts on the fly from the question encoding.
            question_hidden_states = self.question_encoder(input_ids, attention_mask=attention_mask)[
                0]  # pylint: disable=not-callable
            context_input_ids = self.retriever(  # pylint: disable=not-callable
                input_ids,
                question_hidden_states.to(mindspore.float32).numpy(),
                prefix=self.generator.config.prefix,
                n_docs=n_docs,
                return_tensors="ms",
            )["context_input_ids"]

            # set to correct device
            context_input_ids = context_input_ids.to(input_ids.dtype)

        hypos = []
        # Beam-search each document's context independently, then rescore the
        # candidates with the full marginal likelihood ("thorough" decoding).
        model_kwargs["num_beams"] = num_beams
        model_kwargs["num_return_sequences"] = num_beams
        model_kwargs["attention_mask"] = None

        batch_size = input_ids.shape[0] if input_ids is not None else context_input_ids.shape[0] // n_docs

        for index in range(batch_size):
            # first, generate beams from documents:
            generator_input_ids = context_input_ids[index * n_docs: (index + 1) * n_docs]  # (n_docs, max_len)
            output_sequences = self.generator.generate(
                generator_input_ids,
                **model_kwargs,
            )  # n_docs * n_beam, tgt_len
            if do_deduplication:
                # do_deduplication, max_output_len — keyed on token lists so
                # identical hypotheses from different docs collapse to one.
                output_sequences = ops.stack(list({str(k.tolist()): k for k in output_sequences}.values()))

            num_candidates = output_sequences.shape[
                0
            ]  # after deduplication, this number can be less than n_docs*n_beam

            # then, run model forwards to get nll scores:
            if input_ids is not None:
                new_input_ids = ops.tile(input_ids[index: index + 1], (num_candidates, 1))
                outputs = self(new_input_ids, labels=output_sequences, exclude_bos_score=True)
            else:  # input_ids is None, need context_input_ids/mask and doc_scores
                assert context_attention_mask is not None, (
                    "Make sure that `context_attention_mask` are passed, if no `input_ids` is set. Alternatively, you"
                    " can set a retriever using the `set_retriever(...)` function."
                )
                assert doc_scores is not None, (
                    "Make sure that `doc_scores` are passed, if no `input_ids` is set. Alternatively, you can set a"
                    " retriever using the `set_retriever(...)` function."
                )

                individual_input_ids = ops.tile(generator_input_ids, (
                    num_candidates, 1
                ))  # (num_candidates*n_docs, max_len)

                individual_attention_mask = context_attention_mask[index * n_docs: (index + 1) * n_docs]
                individual_attention_mask = ops.tile(individual_attention_mask, (num_candidates, 1))

                individual_doc_scores = doc_scores[index: (index + 1), :]  # doc_scores.shape = [batch, n_docs]
                individual_doc_scores = ops.tile(individual_doc_scores, (num_candidates, 1))  # [num_candidates, n_docs]

                outputs = self(
                    context_input_ids=individual_input_ids,
                    context_attention_mask=individual_attention_mask,
                    doc_scores=individual_doc_scores,
                    labels=output_sequences,
                    exclude_bos_score=True,
                )

            # Keep the candidates with the lowest loss (highest log-likelihood).
            top_cand_inds = (-outputs["loss"]).topk(num_doc_return_sequences)[1]

            # add hypothesis
            hypos.append(output_sequences[top_cand_inds])

        return self._cat_and_pad(hypos, pad_token_id=self.config.generator.pad_token_id)

    def get_nll(
        self, seq_logits, doc_scores, target, reduce_loss=False, epsilon=0.0, exclude_bos_score=False, n_docs=None
    ):
        """
        Compute the (optionally label-smoothed) negative log-likelihood of `target`
        under the RAG-sequence marginal distribution.

        `seq_logits` has shape (batch * n_docs, tgt_len, vocab); `doc_scores` has
        shape (batch, n_docs). Returns a per-example loss, or its sum when
        `reduce_loss=True`.
        """
        # shift tokens left: position t is scored against target[t + 1], and the
        # final slot is filled with the generator's pad token.
        new_fill = mindspore.Tensor((np.full((target.shape[0], 1), self.config.generator.pad_token_id)),
                                    dtype=target[0].dtype)
        target = ops.cat(
            [target[:, 1:], new_fill], 1
        )

        n_docs = n_docs if n_docs is not None else self.config.n_docs

        # bos_token_id is None for T5
        bos_token_id = self.config.bos_token_id or self.config.generator.bos_token_id
        use_bos = bos_token_id is not None and target[:, 0].eq(bos_token_id).all()

        def _mask_pads(ll, smooth_obj):
            # Zero out pad positions so they don't contribute to the loss.
            # FIX: MindSpore's `masked_fill` is NOT in-place — the previous code
            # discarded its result, leaving pad positions unmasked.
            pad_mask = target.eq(self.config.generator.pad_token_id)
            if pad_mask.any():
                ll = ll.masked_fill(pad_mask, 0.0)
                smooth_obj = smooth_obj.masked_fill(pad_mask, 0.0)
            return ll.squeeze(-1), smooth_obj.squeeze(-1)

        # seq_logits dim = (batch*n_docs, tgt_len , #vocabs)
        seq_logprobs = F.log_softmax(seq_logits, dim=-1).view(
            seq_logits.shape[0] // n_docs, n_docs, -1, seq_logits.shape[-1]
        )  # batch_size x n_docs x tgt_len x #vocab_size
        doc_logprobs = F.log_softmax(doc_scores, dim=1).unsqueeze(-1).unsqueeze(-1)

        # RAG-sequence marginalization: fold the document log-prior into the
        # second token's scores (the first token is BOS, shared by all docs).
        first_token_scores = seq_logprobs[:, :, :1, :]
        second_token_scores = seq_logprobs[:, :, 1:2, :]
        remainder = seq_logprobs[:, :, 2:, :]
        rag_logprobs = ops.cat([first_token_scores, second_token_scores + doc_logprobs, remainder], dim=2)

        # calculate loss
        target = ops.tile(target.unsqueeze(1).unsqueeze(-1), (1, n_docs, 1, 1))
        assert target.dim() == rag_logprobs.dim()

        ll = ops.gather(rag_logprobs, dim=-1, index=target)
        smooth_obj = rag_logprobs.sum(axis=-1, keepdims=True)  # total sum of all (normalised) logits

        ll, smooth_obj = _mask_pads(ll, smooth_obj)

        # sum over tokens, exclude bos while scoring
        ll = ll[:, :, 1:].sum(2) if exclude_bos_score and use_bos else ll.sum(2)
        smooth_obj = smooth_obj.sum(2)
        ll = ll.logsumexp(1)  # logsumexp over docs
        smooth_obj = smooth_obj.logsumexp(1)

        nll_loss = -ll
        smooth_loss = -smooth_obj

        if reduce_loss:
            nll_loss = nll_loss.sum()
            smooth_loss = smooth_loss.sum()

        # Standard label-smoothing mix of the NLL and the uniform smoothing term.
        eps_i = epsilon / rag_logprobs.shape[-1]
        loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss
        return loss

    @staticmethod
    def _cat_and_pad(tensors, pad_token_id):
        """Concatenate per-example hypothesis batches of differing lengths into one tensor padded with `pad_token_id`."""
        output = mindspore.Tensor(
            np.full((sum(t.shape[0] for t in tensors), max(t.shape[1] for t in tensors)), pad_token_id),
            dtype=tensors[0].dtype)
        ind = 0
        for t in tensors:
            output[ind: ind + t.shape[0], : t.shape[1]] = t
            ind += t.shape[0]
        return output


class RagTokenForGeneration(RagPreTrainedModel):
    def __init__(
        self,
        config: Optional[PretrainedConfig] = None,
        question_encoder: Optional[PreTrainedModel] = None,
        generator: Optional[PreTrainedModel] = None,
        retriever: Optional[RagRetriever] = None,
        **kwargs,
    ):
        """Build a RAG-token model from a config, or from explicit sub-models (config then derived)."""
        assert config is not None or (
            question_encoder is not None and generator is not None
        ), "Either a configuration or an encoder and a generator has to be provided."

        if config is None:
            # Derive the joint RAG config from the two sub-model configs.
            config = RagConfig.from_question_encoder_generator_configs(
                question_encoder.config, generator.config, **kwargs
            )

        super().__init__(config)

        # instantiate model: all computation is delegated to the inner RagModel
        self.rag = RagModel(config=config, question_encoder=question_encoder, generator=generator, retriever=retriever)

    def set_retriever(self, retriever: RagRetriever):
        """Attach (or replace) the retriever used by the inner RagModel."""
        self.rag.retriever = retriever

    def set_context_encoder_for_training(self, ctx_encoder: PreTrainedModel):
        """Enable end-to-end retrieval training with `ctx_encoder` as the trainable document embedder."""
        self.rag.context_encoder_training = True
        self.rag.ctx_encoder = ctx_encoder

def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
use_cache=None,
encoder_outputs=None,
doc_scores=None,
n_docs=None,
**kwargs,
):
if past_key_values is not None:
# if past is defined use only last decoder_input_ids
decoder_input_ids = decoder_input_ids[:, -1:]

return {
"input_ids": None,
"encoder_outputs": encoder_outputs,
"doc_scores": doc_scores,
"context_attention_mask": attention_mask,
"decoder_input_ids": decoder_input_ids,
"past_key_values": past_key_values,
"use_cache": use_cache,
"do_marginalize": True,
"n_docs": n_docs,
}

    @property
    def retriever(self):
        """The retriever of the inner RagModel."""
        return self.rag.retriever

    @property
    def generator(self):
        """The seq2seq generator of the inner RagModel."""
        return self.rag.generator

    @property
    def question_encoder(self):
        """The question encoder of the inner RagModel."""
        return self.rag.question_encoder

@staticmethod
def _reorder_cache(past_key_values, beam_idx):
"""Reorders cache for generation. BART-inspired but we need to take care of the extra dimension for docs"""

def _reorder_stacked(hidden_states, new_order):
n_docs = hidden_states.shape[0] // new_order.shape[0]
hidden_states = hidden_states.view(-1, n_docs, *hidden_states.shape[1:])
hidden_states = hidden_states.index_select(0, new_order)
result = hidden_states.view(-1, *hidden_states.shape[2:])
return result

reordered_past = ()
for layer_past in past_key_values:
# get the correct batch idx from decoder layer's batch dim for cross and self-attn
reordered_past += (
tuple(_reorder_stacked(past_state, beam_idx) for past_state in layer_past),
)

return reordered_past

def marginalize(self, seq_logits, doc_scores, n_docs=None):
n_docs = n_docs if n_docs is not None else self.config.n_docs

# RAG-token marginalization
seq_logprobs = F.log_softmax(seq_logits, dim=-1).view(
seq_logits.shape[0] // n_docs, n_docs, -1, seq_logits.shape[-1]
)
doc_logprobs = F.log_softmax(doc_scores, dim=1)
log_prob_sum = seq_logprobs + doc_logprobs.unsqueeze(-1).unsqueeze(-1)
# return ops.logsumexp(log_prob_sum, dim=1)
return mindspore.ops.logsumexp(log_prob_sum, axis=1)

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        decoder_input_ids: Optional[mindspore.Tensor] = None,
        decoder_attention_mask: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        context_input_ids: Optional[mindspore.Tensor] = None,
        context_attention_mask: Optional[mindspore.Tensor] = None,
        doc_scores: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_retrieved: Optional[bool] = None,
        do_marginalize: Optional[bool] = None,
        reduce_loss: Optional[bool] = None,
        labels: Optional[mindspore.Tensor] = None,
        n_docs: Optional[int] = None,
        **kwargs,  # needs kwargs for generation
    ) -> RetrievAugLMMarginOutput:
        r"""
        Run the inner RagModel and, when `labels` are given, compute the
        RAG-token NLL loss; optionally marginalize the logits over documents.

        do_marginalize (`bool`, *optional*):
            If `True`, the logits are marginalized over all documents by making use of
            `ops.F.log_softmax`.
        reduce_loss (`bool`, *optional*):
            Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `mindspore.Tensor.sum`
            operation.
        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
            Legacy dictionary, which is required so that model can use *generate()* function.

        Returns:
        """
        # Resolve defaults from the config.
        n_docs = n_docs if n_docs is not None else self.config.n_docs
        do_marginalize = do_marginalize if do_marginalize is not None else self.config.do_marginalize
        reduce_loss = reduce_loss if reduce_loss is not None else self.config.reduce_loss

        if labels is not None:
            # Teacher forcing: feed the labels as decoder input and disable caching.
            if decoder_input_ids is None:
                decoder_input_ids = labels
            use_cache = False

        outputs = self.rag(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_outputs=encoder_outputs,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            context_input_ids=context_input_ids,
            context_attention_mask=context_attention_mask,
            doc_scores=doc_scores,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_retrieved=output_retrieved,
            n_docs=n_docs,
        )

        loss = None
        logits = outputs.logits
        if labels is not None:
            assert decoder_input_ids is not None
            # Loss is computed on the raw (un-marginalized) logits.
            loss = self.get_nll(
                outputs.logits,
                outputs.doc_scores,
                labels,
                reduce_loss=reduce_loss,
                epsilon=self.config.label_smoothing,
                n_docs=n_docs,
            )

        if do_marginalize:
            # Collapse the per-document logits into a single distribution.
            logits = self.marginalize(logits, outputs.doc_scores, n_docs)

        return RetrievAugLMMarginOutput(
            loss=loss,
            logits=logits,
            doc_scores=outputs.doc_scores,
            past_key_values=outputs.past_key_values,
            context_input_ids=outputs.context_input_ids,
            context_attention_mask=outputs.context_attention_mask,
            retrieved_doc_embeds=outputs.retrieved_doc_embeds,
            retrieved_doc_ids=outputs.retrieved_doc_ids,
            question_encoder_last_hidden_state=outputs.question_encoder_last_hidden_state,
            question_enc_hidden_states=outputs.question_enc_hidden_states,
            question_enc_attentions=outputs.question_enc_attentions,
            generator_enc_last_hidden_state=outputs.generator_enc_last_hidden_state,
            generator_enc_hidden_states=outputs.generator_enc_hidden_states,
            generator_enc_attentions=outputs.generator_enc_attentions,
            generator_dec_hidden_states=outputs.generator_dec_hidden_states,
            generator_dec_attentions=outputs.generator_dec_attentions,
            generator_cross_attentions=outputs.generator_cross_attentions,
        )

# NOTE(review): `logits_processor` and `stopping_criteria` use mutable default
# arguments; this appears safe because they are only forwarded to
# `_get_logits_processor` / `_get_stopping_criteria` — confirm those helpers do
# not mutate their arguments in place.
@mindspore._no_grad()
def generate(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    context_input_ids: Optional[mindspore.Tensor] = None,
    context_attention_mask: Optional[mindspore.Tensor] = None,
    doc_scores: Optional[mindspore.Tensor] = None,
    n_docs: Optional[int] = None,
    generation_config: Optional[GenerationConfig] = None,
    prefix_allowed_tokens_fn: Callable[[int, mindspore.Tensor], List[int]] = None,
    logits_processor: Optional[LogitsProcessorList] = LogitsProcessorList(),
    stopping_criteria: Optional[StoppingCriteriaList] = StoppingCriteriaList(),
    **kwargs,
) -> mindspore.Tensor:
    """
    Implements RAG token decoding.

    Args:
        input_ids (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            The sequence used as a prompt for the generation. If `input_ids` is not passed, then
            `context_input_ids` has to be provided.
        attention_mask (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        context_input_ids (`mindspore.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever.

            If the model is not initialized with a `retriever`, `context_input_ids` has to be provided to the
            forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
        context_attention_mask (`mindspore.Tensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever.

            If the model is not initialized with a `retriever`, `context_input_ids` has to be provided to the
            forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
        doc_scores (`mindspore.Tensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`.

            If the model is not initialized with a `retriever`, `context_input_ids` has to be provided to the
            forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
        n_docs (`int`, *optional*, defaults to `config.n_docs`)
            Number of documents to retrieve and/or number of documents for which to generate an answer.
        generation_config (`~generation.GenerationConfig`, *optional*):
            The generation configuration to be used as base parametrization for the generation call. `**kwargs`
            passed to generate matching the attributes of `generation_config` will override them. If
            `generation_config` is not provided, the default will be used, which has the following loading
            priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
            configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
            default values, whose documentation should be checked to parameterize generation.
        prefix_allowed_tokens_fn (`Callable[[int, mindspore.Tensor], List[int]]`, *optional*):
            If provided, this function constraints the beam search to allowed tokens only at each step. If not
            provided no constraint is applied. This function takes 2 arguments `inputs_ids` and the batch ID
            `batch_id`. It has to return a list with the allowed tokens for the next generation step conditioned on
            the previously generated tokens `inputs_ids` and the batch ID `batch_id`. This argument is useful for
            constrained generation conditioned on the prefix, as described in [Autoregressive Entity
            Retrieval](https://arxiv.org/abs/2010.00904).
        logits_processor (`LogitsProcessorList`, *optional*):
            Custom logits processors that complement the default logits processors built from arguments and a
            model's config. If a logit processor is passed that is already created with the arguments or a model's
            config an error is thrown.
        stopping_criteria (`StoppingCriteriaList`, *optional*):
            Custom stopping criteria that complement the default stopping criteria built from arguments and a
            model's config. If a stopping criteria is passed that is already created with the arguments or a
            model's config an error is thrown.
        kwargs (`Dict[str, Any]`, *optional*):
            Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
            forwarded to the `forward` function of the model.

    Return:
        `mindspore.Tensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
        sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches
        finished early due to the `eos_token_id`.
    """
    # Handle `generation_config` and kwargs that might update it
    if generation_config is None:
        generation_config = self.generation_config
    # Deep-copy so the config stored on the model is never mutated by this call.
    generation_config = copy.deepcopy(generation_config)
    model_kwargs = generation_config.update(**kwargs)  # All unused kwargs must be model kwargs

    kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None

    # Inline helper (note the explicit `self` parameter — it is a plain nested
    # function, not a bound method); called exactly once right after its definition.
    def _prepare_special_tokens(
        self,
        generation_config: GenerationConfig,
        kwargs_has_attention_mask: Optional[bool] = None,
    ):
        """
        Prepares the special tokens for generation, overwriting the generation config with their processed versions
        converted to tensor.

        Note that `generation_config` is changed in place and stops being serializable after this method is called.
        That is no problem if called within `generate` (`generation_config` is a local copy that doesn't leave the
        function). However, if called outside `generate`, consider creating a copy of `generation_config` first.
        """

        # Convert special tokens to tensors
        def _tensor_or_none(token):
            if token is None:
                return token
            if isinstance(token, mindspore.Tensor):
                return token
            return mindspore.Tensor(token, dtype=mindspore.int64)

        bos_token_tensor = _tensor_or_none(generation_config.bos_token_id)
        eos_token_tensor = _tensor_or_none(generation_config.eos_token_id)
        pad_token_tensor = _tensor_or_none(generation_config.pad_token_id)
        decoder_start_token_tensor = _tensor_or_none(generation_config.decoder_start_token_id)

        # for BC we also try to get `decoder_start_token_id` or `bos_token_id` (#30892)
        if self.config.is_encoder_decoder:
            decoder_start_token_tensor = (
                decoder_start_token_tensor if decoder_start_token_tensor is not None else bos_token_tensor
            )

        # We can have more than one eos token. Always treat it as a 1D tensor (when it exists).
        if eos_token_tensor is not None and eos_token_tensor.ndim == 0:
            eos_token_tensor = eos_token_tensor.unsqueeze(0)

        # Set pad token if unset (and there are conditions to do so)
        if pad_token_tensor is None and eos_token_tensor is not None:
            if kwargs_has_attention_mask is not None and not kwargs_has_attention_mask:
                logger.warning(
                    "The attention mask and the pad token id were not set. As a consequence, you may observe "
                    "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
                )
            pad_token_tensor = eos_token_tensor[0]
            logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{pad_token_tensor} for open-end generation.")

        # we can't infer attn mask if pad token is set to be eos token in model's generation config
        if eos_token_tensor is not None and pad_token_tensor in eos_token_tensor:
            if kwargs_has_attention_mask is not None and not kwargs_has_attention_mask:
                logger.warning_once(
                    "The attention mask is not set and cannot be inferred from input because pad token is same as eos token."
                    "As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` "
                    "to obtain reliable results."
                )

        # Sanity checks/warnings
        if self.config.is_encoder_decoder and decoder_start_token_tensor is None:
            raise ValueError(
                "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation."
            )
        if eos_token_tensor is not None and (
            ops.is_floating_point(eos_token_tensor) or (eos_token_tensor < 0).any()
        ):
            logger.warning(
                f"`eos_token_id` should consist of positive integers, but is {eos_token_tensor}. Your generation will not "
                "stop until the maximum length is reached. Depending on other flags, it may even crash."
            )

        # Stash tensorized special tokens on the (local copy of the) config.
        generation_config._bos_token_tensor = bos_token_tensor
        generation_config._eos_token_tensor = eos_token_tensor
        generation_config._pad_token_tensor = pad_token_tensor
        generation_config._decoder_start_token_tensor = decoder_start_token_tensor

    _prepare_special_tokens(self, generation_config, kwargs_has_attention_mask)

    # set default parameters
    n_docs = n_docs if n_docs is not None else self.config.n_docs

    # retrieve docs
    if self.retriever is not None and context_input_ids is None:
        # Encode the question, then query the retriever with a float32 numpy copy of the encoding.
        question_hidden_states = self.question_encoder(input_ids, attention_mask=attention_mask)[
            0] # pylint: disable=not-callable
        out = self.retriever( # pylint: disable=not-callable
            input_ids,
            question_hidden_states.to(mindspore.float32).numpy(),
            prefix=self.generator.config.prefix,
            n_docs=n_docs,
            return_tensors="ms",
        )
        context_input_ids, context_attention_mask, retrieved_doc_embeds = (
            out["context_input_ids"],
            out["context_attention_mask"],
            out["retrieved_doc_embeds"],
        )

        # set to correct device
        retrieved_doc_embeds = retrieved_doc_embeds.to(question_hidden_states.dtype)
        context_input_ids = context_input_ids.to(input_ids.dtype)
        context_attention_mask = context_attention_mask.to(input_ids.dtype)

        # compute doc_scores
        doc_scores = ops.bmm(question_hidden_states.unsqueeze(1),
                             ops.transpose(retrieved_doc_embeds, 1, 2)).squeeze(
            1
        )

    assert (context_input_ids.shape[0] % n_docs) == 0, (
        f" The first dimension of `context_input_ids` should be a multiple of `n_docs`={n_docs}, but is"
        f" {context_input_ids.shape[0]}."
    )

    # batch_size
    batch_size = context_input_ids.shape[0] // n_docs

    encoder = self.rag.generator.get_encoder()
    encoder_outputs = encoder(input_ids=context_input_ids, attention_mask=context_attention_mask, return_dict=True)

    # Decoder input: every beam of every batch entry starts from the decoder start token.
    input_ids = ops.full(
        (batch_size * generation_config.num_beams, 1),
        generation_config.decoder_start_token_id,
        dtype=mindspore.int64,
    )
    input_ids_seq_length = input_ids.shape[-1]
    last_hidden_state = encoder_outputs["last_hidden_state"]

    # Duplicate encoder-side tensors so every (beam, doc) pair has its own copy.
    def extend_enc_output(tensor, num_beams=None):
        # split into `batch_size`, `num_beams`, `num_docs`
        tensor = tensor[None, None, :].reshape((batch_size, 1, n_docs) + tensor.shape[1:])
        # repeat same last hidden states over `num_beams` dimension
        tensor = tensor.broadcast_to((batch_size, num_beams, n_docs) + tensor.shape[3:])
        # merge `batch_size`, `num_beams`, `num_docs` dims again
        return tensor.reshape((batch_size * num_beams * n_docs,) + tensor.shape[3:])

    # correctly extend last_hidden_state and attention mask
    context_attention_mask = extend_enc_output(context_attention_mask, num_beams=generation_config.num_beams)
    encoder_outputs["last_hidden_state"] = extend_enc_output(
        last_hidden_state, num_beams=generation_config.num_beams
    )

    doc_scores = ops.repeat_interleave(doc_scores,generation_config.num_beams, dim=0)

    # define start_len & additional parameters
    model_kwargs["doc_scores"] = doc_scores
    model_kwargs["encoder_outputs"] = encoder_outputs
    model_kwargs["attention_mask"] = context_attention_mask
    model_kwargs["n_docs"] = n_docs

    pre_processor = self._get_logits_processor(
        generation_config=generation_config,
        input_ids_seq_length=input_ids_seq_length,
        encoder_input_ids=context_input_ids,
        prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
        logits_processor=logits_processor,
    )

    prepared_stopping_criteria = self._get_stopping_criteria(
        generation_config=generation_config, stopping_criteria=stopping_criteria
    )

    # Dispatch: greedy/sampling for a single beam, beam search otherwise.
    if generation_config.num_beams == 1:
        if generation_config.num_return_sequences > 1:
            raise ValueError(
                f"num_return_sequences has to be 1, but is {generation_config.num_return_sequences} when doing"
                " greedy search."
            )
        return self._sample(
            input_ids,
            logits_processor=pre_processor,
            stopping_criteria=prepared_stopping_criteria,
            generation_config=generation_config,
            synced_gpus=False,
            streamer=None,
            logits_warper=None,
            **model_kwargs,
        )
    elif generation_config.num_beams > 1:
        if generation_config.num_return_sequences > generation_config.num_beams:
            raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.")
        beam_scorer = BeamSearchScorer(
            batch_size=batch_size,
            num_beams=generation_config.num_beams,
            length_penalty=generation_config.length_penalty,
            do_early_stopping=generation_config.early_stopping,
            num_beam_hyps_to_keep=generation_config.num_return_sequences,
            max_length=generation_config.max_length,
        )
        return self._beam_search(
            input_ids,
            beam_scorer,
            logits_processor=pre_processor,
            stopping_criteria=prepared_stopping_criteria,
            generation_config=generation_config,
            synced_gpus=False,
            logits_warper=None,
            **model_kwargs,
        )
    else:
        raise ValueError(
            f"`num_beams` has to be an integer strictly superior to 0 (≥ 1), but is {generation_config.num_beams}"
        )

def get_input_embeddings(self):
    """Expose the input embedding layer of the underlying generator."""
    generator = self.rag.generator
    return generator.get_input_embeddings()

def get_output_embeddings(self):
    """Expose the output embedding layer of the underlying generator."""
    generator = self.rag.generator
    return generator.get_output_embeddings()

def set_output_embeddings(self, new_embeddings):
    """Delegate replacement of the output embeddings to the underlying generator."""
    generator = self.rag.generator
    return generator.set_output_embeddings(new_embeddings)

def shift_tokens_right(self, input_ids, start_token_id=None):
    """Shift `input_ids` one position to the right, filling column 0 with `start_token_id`.

    Falls back to `config.decoder_start_token_id` when no start token is given; the
    last column of the input is dropped.
    """
    token_id = self.config.decoder_start_token_id if start_token_id is None else start_token_id
    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[:, 0] = token_id
    shifted[:, 1:] = input_ids[:, :-1].clone()
    return shifted

def get_nll(self, seq_logits, doc_scores, target, reduce_loss=False, epsilon=0.0, n_docs=None):
    """Compute the negative marginal log-likelihood with label smoothing.

    Args:
        seq_logits: generator logits for every (question, document) pair.
        doc_scores: retrieval scores used by `self.marginalize` to sum over documents.
        target: gold token ids; positions equal to the generator pad id are ignored.
        reduce_loss (`bool`): if True, sum the per-sample losses into a scalar.
        epsilon (`float`): label-smoothing factor.
        n_docs (`int`, *optional*): number of documents, defaults to `config.n_docs`.

    Returns:
        The smoothed NLL loss (per sample, or summed when `reduce_loss=True`).
    """
    n_docs = n_docs if n_docs is not None else self.config.n_docs
    # Shift tokens left: position t is scored against token t+1, padding the last column.
    new_fill = mindspore.Tensor((np.full((target.shape[0], 1), self.config.generator.pad_token_id)),
                                dtype=target[0].dtype)
    target = ops.cat([target[:, 1:], new_fill], 1)

    def _mask_pads(ll, smooth_obj):
        # Zero out contributions from padding positions.
        pad_mask = target.eq(self.config.generator.pad_token_id)
        if pad_mask.any():
            # BUGFIX: `masked_fill` returns a new tensor (it is not in-place), so the
            # results must be assigned back — previously the masking was discarded and
            # pad positions still contributed to the loss.
            ll = ll.masked_fill(pad_mask, 0.0)
            smooth_obj = smooth_obj.masked_fill(pad_mask, 0.0)
        return ll.squeeze(-1), smooth_obj.squeeze(-1)

    rag_logprobs = self.marginalize(seq_logits, doc_scores, n_docs)

    target = target.unsqueeze(-1)
    assert target.dim() == rag_logprobs.dim()

    # Log-probability of each gold token under the marginalized distribution.
    ll = ops.gather(rag_logprobs, dim=-1, index=target)
    smooth_obj = rag_logprobs.sum(axis=-1, keepdims=True)  # total sum of all (normalised) logits

    ll, smooth_obj = _mask_pads(ll, smooth_obj)
    ll = ll.sum(1)  # sum over tokens
    smooth_obj = smooth_obj.sum(1)

    nll_loss = -ll
    smooth_loss = -smooth_obj

    if reduce_loss:
        nll_loss = nll_loss.sum()
        smooth_loss = smooth_loss.sum()

    # Standard label smoothing: spread `epsilon` mass uniformly over the vocabulary.
    eps_i = epsilon / rag_logprobs.shape[-1]
    loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss
    return loss


# Names exported by `from <this module> import *`.
__all__ = [
    "RagModel",
    "RagPreTrainedModel",
    "RagSequenceForGeneration",
    "RagTokenForGeneration",
]

+ 667
- 0
mindnlp/transformers/models/rag/retrieval_rag.py View File

@@ -0,0 +1,667 @@
# coding=utf-8
# Copyright 2020, The RAG Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RAG Retriever model implementation."""

import os
import pickle
import time
from typing import Iterable, List, Optional, Tuple

import numpy as np

from mindnlp.utils import cached_file, logging, requires_backends, strtobool
from .configuration_rag import RagConfig
from .tokenization_rag import RagTokenizer
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import BatchEncoding


def is_datasets_available():
    """Return True if the optional `datasets` package can be imported, else False.

    BUGFIX: previously this raised ImportError on failure, which defeats its use as
    an availability probe (the `if is_datasets_available():` guard below could never
    take the False branch, and importing this module crashed without `datasets`).
    """
    try:
        import datasets  # pylint: disable=unused-import
        return True
    except ImportError:
        return False


def is_faiss_available():
    """Return True if the optional `faiss` package can be imported, else False.

    BUGFIX: previously this raised ImportError on failure, which defeats its use as
    an availability probe — callers such as `RagRetriever.__init__` test the boolean
    and raise their own error when faiss is missing.
    """
    try:
        import faiss  # pylint: disable=unused-import
        return True
    except ImportError:
        return False


# Optional heavy dependencies: dataset / faiss symbols are only bound when the
# corresponding probe succeeds, keeping the module importable without them.
if is_datasets_available():
    from datasets import Dataset, load_dataset, load_from_disk

if is_faiss_available():
    import faiss

logger = logging.get_logger(__name__)

# Remote directory hosting the legacy DPR index/passage files (see `LegacyIndex`).
LEGACY_INDEX_PATH = "https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/"


class Index:
    """
    A base class for the Indices encapsulated by the [`RagRetriever`].
    """

    def get_doc_dicts(self, doc_ids: np.ndarray) -> List[dict]:
        """
        Return one dictionary per batch entry containing the titles and texts of the retrieved documents.

        Args:
            doc_ids (`np.ndarray` of shape `(batch_size, n_docs)`):
                A tensor of document indices.
        """
        raise NotImplementedError

    def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
        """
        Retrieve the `n_docs` best documents for every query in the batch.

        Args:
            question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
                An array of query vectors.
            n_docs (`int`):
                The number of docs retrieved per query.

        Returns:
            `np.ndarray` of shape `(batch_size, n_docs)`: indices of the retrieved documents, and
            `np.ndarray` of shape `(batch_size, vector_size)`: vector representations of the retrieved documents.
        """
        raise NotImplementedError

    def is_initialized(self):
        """Return `True` if the index has already been loaded into memory."""
        raise NotImplementedError

    def init_index(self):
        """
        Load the index into memory. Should be called only once per training run of a RAG model;
        e.g. in a distributed multi-GPU setup only one of the workers loads the index.
        """
        raise NotImplementedError


class LegacyIndex(Index):
    """
    An index which can be deserialized from the files built using https://github.com/facebookresearch/DPR. We use
    default faiss index parameters as specified in that repository.

    Args:
        vector_size (`int`):
            The dimension of indexed vectors.
        index_path (`str`):
            A path to a *directory* containing index files compatible with [`~models.rag.retrieval_rag.LegacyIndex`]
    """

    INDEX_FILENAME = "hf_bert_base.hnswSQ8_correct_phi_128.c_index"
    PASSAGE_FILENAME = "psgs_w100.tsv.pkl"

    def __init__(self, vector_size, index_path):
        self.index_id_to_db_id = []
        self.index_path = index_path
        # Passages are loaded eagerly; the faiss index itself is loaded lazily via `init_index`.
        self.passages = self._load_passages()
        self.vector_size = vector_size
        self.index = None
        self._index_initialized = False

    def _resolve_path(self, index_path, filename):
        """Resolve `filename` under `index_path` (local directory or remote repo) to a concrete file path."""
        is_local = os.path.isdir(index_path)
        try:
            # Load from URL or cache if already cached
            resolved_archive_file = cached_file(index_path, filename)
        except EnvironmentError:
            # BUGFIX: interpolate the actual file name instead of the literal
            # placeholder "(unknown)" the message previously contained.
            msg = (
                f"Can't load '{filename}'. Make sure that:\n\n"
                f"- '{index_path}' is a correct remote path to a directory containing a file named {filename}\n\n"
                f"- or '{index_path}' is the correct path to a directory containing a file named {filename}.\n\n"
            )
            raise EnvironmentError(msg)
        if is_local:
            logger.info(f"loading file {resolved_archive_file}")
        else:
            logger.info(f"loading file {filename} from cache at {resolved_archive_file}")
        return resolved_archive_file

    def _load_passages(self):
        """Unpickle the passages file; gated behind TRUST_REMOTE_CODE because pickle is unsafe."""
        logger.info(f"Loading passages from {self.index_path}")
        passages_path = self._resolve_path(self.index_path, self.PASSAGE_FILENAME)
        if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
            raise ValueError(
                "This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially "
                "malicious. It's recommended to never unpickle data that could have come from an untrusted source, or "
                "that could have been tampered with. If you already verified the pickle data and decided to use it, "
                "you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it."
            )
        with open(passages_path, "rb") as passages_file:
            passages = pickle.load(passages_file)
        return passages

    def _deserialize_index(self):
        """Read the serialized faiss index and its id-mapping metadata from disk."""
        logger.info(f"Loading index from {self.index_path}")
        resolved_index_path = self._resolve_path(self.index_path, self.INDEX_FILENAME + ".index.dpr")
        self.index = faiss.read_index(resolved_index_path)
        resolved_meta_path = self._resolve_path(self.index_path, self.INDEX_FILENAME + ".index_meta.dpr")
        if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
            raise ValueError(
                "This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially "
                "malicious. It's recommended to never unpickle data that could have come from an untrusted source, or "
                "that could have been tampered with. If you already verified the pickle data and decided to use it, "
                "you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it."
            )
        with open(resolved_meta_path, "rb") as metadata_file:
            self.index_id_to_db_id = pickle.load(metadata_file)
        assert (
            len(self.index_id_to_db_id) == self.index.ntotal
        ), "Deserialized index_id_to_db_id should match faiss index size"

    def is_initialized(self):
        return self._index_initialized

    def init_index(self):
        # HNSW parameters match the defaults used by the DPR repository.
        index = faiss.IndexHNSWFlat(self.vector_size + 1, 512)
        index.hnsw.efSearch = 128
        index.hnsw.efConstruction = 200
        self.index = index
        self._deserialize_index()
        self._index_initialized = True

    def get_doc_dicts(self, doc_ids: np.ndarray):
        """Map retrieved doc ids to `{"title": [...], "text": [...]}` dicts, one per batch entry."""
        doc_list = []
        for doc_ids_i in doc_ids:
            # Passages are keyed by the string form of the integer id.
            ids = [str(int(doc_id)) for doc_id in doc_ids_i]
            docs = [self.passages[doc_id] for doc_id in ids]
            doc_list.append(docs)
        doc_dicts = []
        for docs in doc_list:
            doc_dict = {}
            # Each passage tuple is (text, title).
            doc_dict["title"] = [doc[1] for doc in docs]
            doc_dict["text"] = [doc[0] for doc in docs]
            doc_dicts.append(doc_dict)
        return doc_dicts

    def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
        # The legacy index stores an extra auxiliary dimension; pad queries with a zero column.
        aux_dim = np.zeros(len(question_hidden_states), dtype="float32").reshape(-1, 1)
        query_nhsw_vectors = np.hstack((question_hidden_states, aux_dim))
        _, docs_ids = self.index.search(query_nhsw_vectors, n_docs)
        # Reconstruct the stored vectors, dropping the auxiliary dimension again.
        vectors = [[self.index.reconstruct(int(doc_id))[:-1] for doc_id in doc_ids] for doc_ids in docs_ids]
        ids = [[int(self.index_id_to_db_id[doc_id]) for doc_id in doc_ids] for doc_ids in docs_ids]
        return np.array(ids), np.array(vectors)


class HFIndexBase(Index):
    """Shared behaviour for indices backed by a `datasets.Dataset` with an `embeddings` column."""

    def __init__(self, vector_size, dataset, index_initialized=False):
        self.vector_size = vector_size
        self.dataset = dataset
        self._index_initialized = index_initialized
        self._check_dataset_format(with_index=index_initialized)
        # Keep embeddings as float32 numpy arrays; all other columns stay untouched.
        dataset.set_format("numpy", columns=["embeddings"], output_all_columns=True, dtype="float32")

    def _check_dataset_format(self, with_index: bool):
        """Validate the dataset type, required columns and (optionally) the faiss index."""
        if not isinstance(self.dataset, Dataset):
            raise TypeError(f"Dataset should be a datasets.Dataset object, but got {type(self.dataset)}")
        missing = {"title", "text", "embeddings"} - set(self.dataset.column_names)
        if missing:
            raise ValueError(
                "Dataset should be a dataset with the following columns: "
                "title (str), text (str) and embeddings (arrays of dimension vector_size), "
                f"but got columns {self.dataset.column_names}"
            )
        if with_index and "embeddings" not in self.dataset.list_indexes():
            raise ValueError(
                "Missing faiss index in the dataset. Make sure you called `dataset.add_faiss_index` to compute it "
                "or `dataset.load_faiss_index` to load one from the disk."
            )

    def init_index(self):
        raise NotImplementedError()

    def is_initialized(self):
        return self._index_initialized

    def get_doc_dicts(self, doc_ids: np.ndarray) -> List[dict]:
        """Return one column-dict per batch row of `doc_ids`."""
        return [self.dataset[doc_ids[row].tolist()] for row in range(doc_ids.shape[0])]

    def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
        _, ids = self.dataset.search_batch("embeddings", question_hidden_states, n_docs)
        # Negative ids mark missing hits and are dropped before fetching rows.
        docs = [self.dataset[[i for i in indices if i >= 0]] for indices in ids]
        vectors = [doc["embeddings"] for doc in docs]
        # Pad short result sets with zero vectors so every row has exactly n_docs entries.
        for row, vecs in enumerate(vectors):
            if len(vecs) < n_docs:
                vectors[row] = np.vstack([vecs, np.zeros((n_docs - len(vecs), self.vector_size))])
        return np.array(ids), np.array(vectors)  # shapes (batch_size, n_docs) and (batch_size, n_docs, d)


class CanonicalHFIndex(HFIndexBase):
    """
    A wrapper around an instance of [`~datasets.Datasets`]. If `index_path` is set to `None`, we load the pre-computed
    index available with the [`~datasets.arrow_dataset.Dataset`], otherwise, we load the index from the indicated path
    on disk.

    Args:
        vector_size (`int`): the dimension of the passages embeddings used by the index
        dataset_name (`str`, optional, defaults to `wiki_dpr`):
            A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
            with `datasets.list_datasets()`).
        dataset_split (`str`, optional, defaults to `train`)
            Which split of the `dataset` to load.
        index_name (`str`, optional, defaults to `None`)
            The index_name of the index associated with the `dataset`. The index loaded from `index_path` will be saved
            under this name.
        index_path (`str`, optional, defaults to `None`)
            The path to the serialized faiss index on disk.
        use_dummy_dataset (`bool`, optional, defaults to `False`):
            If True, use the dummy configuration of the dataset for tests.
        dataset_revision (optional, defaults to `None`):
            The revision of the dataset to load.
    """

    def __init__(
        self,
        vector_size: int,
        dataset_name: str = "wiki_dpr",
        dataset_split: str = "train",
        index_name: Optional[str] = None,
        index_path: Optional[str] = None,
        use_dummy_dataset=False,
        dataset_revision=None,
    ):
        # Exactly one of `index_name` / `index_path` must be provided.
        if int(index_path is None) + int(index_name is None) != 1:
            raise ValueError("Please provide `index_name` or `index_path`.")
        self.dataset_name = dataset_name
        self.dataset_split = dataset_split
        self.index_name = index_name
        self.index_path = index_path
        self.use_dummy_dataset = use_dummy_dataset
        self.dataset_revision = dataset_revision
        logger.info(f"Loading passages from {self.dataset_name}")
        # NOTE(review): `with_index` and `dummy` are arguments of the wiki_dpr dataset
        # loading script (hence `trust_remote_code=True`), not standard `load_dataset`
        # parameters — verify against the dataset script.
        dataset = load_dataset(
            self.dataset_name,
            with_index=False,
            split=self.dataset_split,
            dummy=self.use_dummy_dataset,
            revision=dataset_revision,
            trust_remote_code=True
        )
        super().__init__(vector_size, dataset, index_initialized=False)

    def init_index(self):
        # Either attach a serialized index from disk, or reload the dataset with its
        # pre-computed named index.
        if self.index_path is not None:
            logger.info(f"Loading index from {self.index_path}")
            self.dataset.load_faiss_index("embeddings", file=self.index_path)
        else:
            logger.info(f"Loading index from {self.dataset_name} with index name {self.index_name}")
            self.dataset = load_dataset(
                self.dataset_name,
                with_embeddings=True,
                with_index=True,
                split=self.dataset_split,
                index_name=self.index_name,
                dummy=self.use_dummy_dataset,
                revision=self.dataset_revision,
                trust_remote_code=True
            )
            self.dataset.set_format("numpy", columns=["embeddings"], output_all_columns=True)
        self._index_initialized = True


class CustomHFIndex(HFIndexBase):
    """
    A wrapper around an instance of [`~datasets.Datasets`]. The dataset and the index are both loaded from the
    indicated paths on disk.

    Args:
        vector_size (`int`): the dimension of the passages embeddings used by the index
        dataset_path (`str`):
            The path to the serialized dataset on disk. The dataset should have 3 columns: title (str), text (str) and
            embeddings (arrays of dimension vector_size)
        index_path (`str`)
            The path to the serialized faiss index on disk.
    """

    def __init__(self, vector_size: int, dataset, index_path=None):
        # With no index path the dataset is assumed to already carry its faiss index.
        super().__init__(vector_size, dataset, index_initialized=index_path is None)
        self.index_path = index_path

    @classmethod
    def load_from_disk(cls, vector_size, dataset_path, index_path):
        """Build a `CustomHFIndex` from a saved dataset and a saved faiss index."""
        logger.info(f"Loading passages from {dataset_path}")
        if dataset_path is None or index_path is None:
            raise ValueError(
                "Please provide `dataset_path` and `index_path` after calling `dataset.save_to_disk(dataset_path)` "
                "and `dataset.get_index('embeddings').save(index_path)`."
            )
        loaded = load_from_disk(dataset_path)
        return cls(vector_size=vector_size, dataset=loaded, index_path=index_path)

    def init_index(self):
        """Attach the serialized faiss index to the dataset (no-op when already initialized)."""
        if not self.is_initialized():
            logger.info(f"Loading index from {self.index_path}")
            self.dataset.load_faiss_index("embeddings", file=self.index_path)
            self._index_initialized = True


class RagRetriever:
"""
Retriever used to get documents from vector queries. It retrieves the documents embeddings as well as the documents
contents, and it formats them to be used with a RagModel.

Args:
config ([`RagConfig`]):
The configuration of the RAG model this Retriever is used with. Contains parameters indicating which
`Index` to build. You can load your own custom dataset with `config.index_name="custom"` or use a canonical
one (default) from the datasets library with `config.index_name="wiki_dpr"` for example.
question_encoder_tokenizer ([`PreTrainedTokenizer`]):
The tokenizer that was used to tokenize the question. It is used to decode the question and then use the
generator_tokenizer.
generator_tokenizer ([`PreTrainedTokenizer`]):
The tokenizer used for the generator part of the RagModel.
index ([`~models.rag.retrieval_rag.Index`], optional, defaults to the one defined by the configuration):
If specified, use this index instead of the one built using the configuration

"""

def __init__(self, config, question_encoder_tokenizer, generator_tokenizer, index=None, init_retrieval=True):
    """Build the retriever; by default (`init_retrieval=True`) the index is loaded immediately."""
    self._init_retrieval = init_retrieval
    if not is_faiss_available():
        raise ImportError(
            "faiss have not been installed. Please install it with `pip install faiss_gpu` or `pip install faiss_cpu`.")
    requires_backends(self, ["datasets"])
    super().__init__()
    # Use the caller-supplied index when given, otherwise build one from the config.
    self.index = index or self._build_index(config)
    self.generator_tokenizer = generator_tokenizer
    self.question_encoder_tokenizer = question_encoder_tokenizer

    self.n_docs = config.n_docs
    self.batch_size = config.retrieval_batch_size

    self.config = config
    if self._init_retrieval:
        self.init_retrieval()

    # Presumably configured later by a context-encoder setter; the code doing so is
    # not visible in this chunk — confirm before relying on these staying None/False.
    self.ctx_encoder_tokenizer = None
    self.return_tokenized_docs = False

@staticmethod
def _build_index(config):
    """Instantiate the `Index` implementation selected by `config.index_name`."""
    if config.index_name == "legacy":
        # DPR-era index files, downloaded from the legacy bucket unless a path is given.
        return LegacyIndex(
            config.retrieval_vector_size,
            config.index_path or LEGACY_INDEX_PATH,
        )
    if config.index_name == "custom":
        # User-supplied dataset + faiss index serialized on disk.
        return CustomHFIndex.load_from_disk(
            vector_size=config.retrieval_vector_size,
            dataset_path=config.passages_path,
            index_path=config.index_path,
        )
    # Default: a canonical dataset from the datasets hub (e.g. wiki_dpr).
    return CanonicalHFIndex(
        vector_size=config.retrieval_vector_size,
        dataset_name=config.dataset,
        dataset_split=config.dataset_split,
        index_name=config.index_name,
        index_path=config.index_path,
        use_dummy_dataset=config.use_dummy_dataset,
        dataset_revision=config.dataset_revision,
    )

@classmethod
def from_pretrained(cls, retriever_name_or_path, indexed_dataset=None, **kwargs):
    """Instantiate a retriever from a pretrained RAG checkpoint.

    Args:
        retriever_name_or_path: model id or local path holding the RAG config and tokenizers.
        indexed_dataset: optional pre-built `datasets.Dataset` with a faiss index; when given,
            `config.index_name` is forced to "custom" and the dataset is wrapped in a `CustomHFIndex`.
        **kwargs: forwarded to `RagConfig.from_pretrained` (ignored for config loading when an
            explicit `config` kwarg is supplied).
    """
    if not is_faiss_available():
        raise ImportError(
            "faiss have not been installed. Please install it with `pip install faiss_gpu` or `pip install faiss_cpu`.")
    requires_backends(cls, ["datasets"])
    config = kwargs.pop("config", None) or RagConfig.from_pretrained(retriever_name_or_path, **kwargs)
    rag_tokenizer = RagTokenizer.from_pretrained(retriever_name_or_path, config=config)
    question_encoder_tokenizer = rag_tokenizer.question_encoder
    generator_tokenizer = rag_tokenizer.generator
    if indexed_dataset is not None:
        config.index_name = "custom"
        index = CustomHFIndex(config.retrieval_vector_size, indexed_dataset)
    else:
        index = cls._build_index(config)
    return cls(
        config,
        question_encoder_tokenizer=question_encoder_tokenizer,
        generator_tokenizer=generator_tokenizer,
        index=index,
    )

def save_pretrained(self, save_directory):
    """Persist the retriever to ``save_directory``.

    For a custom index this also writes the faiss index and the passages
    dataset (recording their paths in the config) before saving the config
    and both tokenizers.
    """
    if isinstance(self.index, CustomHFIndex):
        if self.config.index_path is None:
            faiss_path = os.path.join(save_directory, "hf_dataset_index.faiss")
            self.index.dataset.get_index("embeddings").save(faiss_path)
            self.config.index_path = faiss_path
        if self.config.passages_path is None:
            dataset_dir = os.path.join(save_directory, "hf_dataset")
            # datasets don't support save_to_disk with indexes right now,
            # so detach the faiss index, save, then re-attach it.
            embeddings_index = self.index.dataset._indexes.pop("embeddings")
            self.index.dataset.save_to_disk(dataset_dir)
            self.index.dataset._indexes["embeddings"] = embeddings_index
            self.config.passages_path = dataset_dir
    self.config.save_pretrained(save_directory)
    RagTokenizer(
        question_encoder=self.question_encoder_tokenizer,
        generator=self.generator_tokenizer,
    ).save_pretrained(save_directory)

def init_retrieval(self):
    """Load the retrieval index into memory so queries can be served."""
    logger.info("initializing retrieval")
    self.index.init_index()

def postprocess_docs(self, docs, input_strings, prefix, n_docs, return_tensors=None):
    r"""
    Postprocessing retrieved `docs` and combining them with `input_strings`.

    Args:
        docs (`dict`):
            Retrieved documents.
        input_strings (`str`):
            Input strings decoded by `preprocess_query`.
        prefix (`str`):
            Prefix added at the beginning of each input, typically used with T5-based models.
        n_docs (`int`):
            Number of retrieved documents combined with each input string.
        return_tensors (`str`, *optional*):
            Tensor type forwarded to the generator tokenizer.

    Return:
        `tuple(tensors)`: a tuple consisting of two elements: contextualized `input_ids` and a compatible
        `attention_mask`.
    """

    def cat_input_and_doc(doc_title, doc_text, input_string, prefix):
        # TODO(Patrick): if we train more RAG models, I want to put the input first to take advantage of effortless truncation
        # TODO(piktus): better handling of truncation
        # Strip surrounding double quotes some passage titles carry.
        if doc_title.startswith('"'):
            doc_title = doc_title[1:]
        if doc_title.endswith('"'):
            doc_title = doc_title[:-1]
        if prefix is None:
            prefix = ""
        # Bug fix: previously `.replace(" ", " ")` — a no-op because both
        # arguments were a single space. The intent is to collapse the
        # double spaces produced when concatenating prefix/title/text/input.
        out = (prefix + doc_title + self.config.title_sep + doc_text + self.config.doc_sep + input_string).replace(
            "  ", " "
        )
        return out

    rag_input_strings = [
        cat_input_and_doc(
            docs[i]["title"][j],
            docs[i]["text"][j],
            input_strings[i],
            prefix,
        )
        for i in range(len(docs))
        for j in range(n_docs)
    ]

    contextualized_inputs = self.generator_tokenizer.batch_encode_plus(
        rag_input_strings,
        max_length=self.config.max_combined_length,
        return_tensors=return_tensors,
        padding="max_length",
        truncation=True,
    )

    return contextualized_inputs["input_ids"], contextualized_inputs["attention_mask"]

def _chunk_tensor(self, t: Iterable, chunk_size: int) -> List[Iterable]:
return [t[i: i + chunk_size] for i in range(0, len(t), chunk_size)]

def _main_retrieve(self, question_hidden_states: np.ndarray, n_docs: int) -> Tuple[np.ndarray, np.ndarray]:
    """Retrieve top-``n_docs`` ids and embeddings for every query vector.

    Queries are processed in chunks of ``self.batch_size`` to bound the
    per-call load on the index.

    Returns:
        Arrays of shapes (batch_size, n_docs) and (batch_size, n_docs, d).
    """
    collected_ids = []
    collected_vectors = []
    for question_hidden_states in self._chunk_tensor(question_hidden_states, self.batch_size):
        start_time = time.time()
        ids, vectors = self.index.get_top_docs(question_hidden_states, n_docs)
        logger.debug(
            f"index search time: {time.time() - start_time} sec, batch size {question_hidden_states.shape}"
        )
        collected_ids.extend(ids)
        collected_vectors.extend(vectors)
    return np.array(collected_ids), np.array(collected_vectors)

def retrieve(self, question_hidden_states: np.ndarray, n_docs: int) -> Tuple[np.ndarray, List[dict]]:
    """
    Retrieves documents for specified `question_hidden_states`.

    Args:
        question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
            A batch of query vectors to retrieve with.
        n_docs (`int`):
            The number of docs retrieved per query.

    Return:
        `Tuple[np.ndarray, np.ndarray, List[dict]]`: A tuple with the following objects:

        - **retrieved_doc_embeds** (`np.ndarray` of shape `(batch_size, n_docs, dim)`) -- The retrieval embeddings
          of the retrieved docs per query.
        - **doc_ids** (`np.ndarray` of shape `(batch_size, n_docs)`) -- The ids of the documents in the index
        - **doc_dicts** (`List[dict]`): The `retrieved_doc_embeds` examples per query.
    """
    doc_ids, retrieved_doc_embeds = self._main_retrieve(question_hidden_states, n_docs)
    doc_dicts = self.index.get_doc_dicts(doc_ids)
    return retrieved_doc_embeds, doc_ids, doc_dicts

def set_ctx_encoder_tokenizer(self, ctx_encoder_tokenizer: PreTrainedTokenizer):
    """Attach a context-encoder tokenizer (used in end-to-end retriever training).

    After this call ``__call__`` additionally returns tokenized versions of
    the retrieved documents.
    """
    self.ctx_encoder_tokenizer = ctx_encoder_tokenizer
    self.return_tokenized_docs = True

def __call__(
    self,
    question_input_ids: List[List[int]],
    question_hidden_states: np.ndarray,
    prefix=None,
    n_docs=None,
    return_tensors=None,
) -> BatchEncoding:
    """
    Retrieves documents for specified `question_hidden_states`.

    Args:
        question_input_ids (`List[List[int]]`) batch of input ids
        question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`:
            A batch of query vectors to retrieve with.
        prefix (`str`, *optional*):
            The prefix used by the generator's tokenizer.
        n_docs (`int`, *optional*):
            The number of docs retrieved per query.
        return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to "pt"):
            If set, will return tensors instead of list of python integers. Acceptable values are:

            - `'tf'`: Return TensorFlow `tf.constant` objects.
            - `'pt'`: Return PyTorch `torch.Tensor` objects.
            - `'np'`: Return Numpy `np.ndarray` objects.

    Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

        - **context_input_ids** -- List of token ids to be fed to a model.
        - **context_attention_mask** -- List of indices specifying which tokens should be attended to by the model
          (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
        - **retrieved_doc_embeds** -- List of embeddings of the retrieved documents
        - **doc_ids** -- List of ids of the retrieved documents
        - **tokenized_doc_ids** / **tokenized_doc_attention_mask** -- only when a context-encoder
          tokenizer was registered via `set_ctx_encoder_tokenizer`.
    """
    # Fall back to config defaults when the caller does not override them.
    if n_docs is None:
        n_docs = self.n_docs
    if prefix is None:
        prefix = self.config.generator.prefix

    retrieved_doc_embeds, doc_ids, docs = self.retrieve(question_hidden_states, n_docs)

    input_strings = self.question_encoder_tokenizer.batch_decode(question_input_ids, skip_special_tokens=True)
    context_input_ids, context_attention_mask = self.postprocess_docs(
        docs, input_strings, prefix, n_docs, return_tensors=return_tensors
    )

    output = {
        "context_input_ids": context_input_ids,
        "context_attention_mask": context_attention_mask,
        "retrieved_doc_embeds": retrieved_doc_embeds,
        "doc_ids": doc_ids,
    }

    if self.return_tokenized_docs:
        # End-to-end retriever training: also tokenize the retrieved docs
        # with the context-encoder tokenizer.
        retrieved_doc_title = []
        retrieved_doc_text = []
        for doc in docs:
            for doc_idx in range(n_docs):
                retrieved_doc_text.append(doc["text"][doc_idx])
                retrieved_doc_title.append(doc["title"][doc_idx])

        tokenized_docs = self.ctx_encoder_tokenizer(
            retrieved_doc_title,
            retrieved_doc_text,
            truncation=True,
            padding="longest",
            return_tensors=return_tensors,
        )
        output["tokenized_doc_ids"] = tokenized_docs["input_ids"]
        output["tokenized_doc_attention_mask"] = tokenized_docs["attention_mask"]

    return BatchEncoding(output, tensor_type=return_tensors)


__all__ = [
"RagRetriever"
]
