Mainstream LLM Test Program - Exporting the Operator List

How many operators does it take to cover the mainstream large language models? To find out, the __torch_dispatch__-based dump method is used to dump each model's operator and argument list. To stay within device memory limits, every model is built with only a single transformer layer.
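The TorchDumper / TorchDumpDispatchMode used in the test program come from that torch_hook dump module and are not reproduced here. Purely as an illustration of the idea, a minimal dumper built on the public torch.utils._python_dispatch.TorchDispatchMode API might look like the sketch below (the class names, record format, and file handling are assumptions of this sketch, not the actual torch_hook code):

import pickle
from torch.utils._python_dispatch import TorchDispatchMode

class SimpleOpDumpMode(TorchDispatchMode):
    # Hypothetical sketch: records the name of every aten op seen by __torch_dispatch__.
    def __init__(self, records):
        super().__init__()
        self.records = records

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        self.records.append(str(func))  # e.g. "aten.addmm.default"
        return func(*args, **kwargs)

class SimpleDumper:
    # Hypothetical sketch: context manager that saves the collected op names to a pickle file.
    def __init__(self, mode_cls, op_log_path):
        self.records = []
        self.mode = mode_cls(self.records)
        self.op_log_path = op_log_path

    def __enter__(self):
        self.mode.__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.mode.__exit__(exc_type, exc_val, exc_tb)
        with open(self.op_log_path, "wb") as f:
            pickle.dump(self.records, f)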

1. Reference Links

2. Download Links

3. Test Program

import warnings
warnings.filterwarnings("ignore")
import torch
import multiprocessing as mp
from tqdm import tqdm

# Registry of model name -> test function, populated by the llm_forward decorator.
op_mapping={}
class llm_forward:
    def __init__(self,func):
        global op_mapping  
        op_mapping[func.__name__]=func
        self.func=func
         
    def __call__(self,*args,**kwargs):
        return self.func(*args,**kwargs)

# Fall back to no-op stubs when the torch_hook dump utilities are not available,
# so the forward/backward passes can still run (without dumping any operators).
try:
    from torch_hook import TorchDumper,TorchDumpDispatchMode
except ImportError:
    class TorchDumpDispatchMode:
        pass

    class TorchDumper:
        def __init__(self,*args,**kwargs):
            pass

        def __enter__(self):
            pass

        def __exit__(self, exc_type, exc_val, exc_tb):
            pass

@llm_forward
def bert_base_chinese(use_half,device):
  from transformers import AutoModelForMaskedLM,BertConfig
  config=BertConfig.from_pretrained("bert_base_chinese/config.json")
  config.num_hidden_layers=1
  model = AutoModelForMaskedLM.from_config(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,config.max_position_embeddings))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="bert_base_chinese.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def Baichuan2_13B_Chat(use_half,device):
  import sys
  sys.path.insert(0,"./Baichuan2_13B_Chat")
  from configuration_baichuan2 import BaichuanConfig
  from modeling_baichuan2 import BaichuanForCausalLM
  config=BaichuanConfig.from_pretrained("Baichuan2_13B_Chat/config.json")
  config.num_hidden_layers=1
  model = BaichuanForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,config.model_max_length//4))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="Baichuan2_13B_Chat.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def baichuan_7B(use_half,device):
  import sys
  import os
  sys.path.insert(0,os.path.join(os.getcwd(),"baichuan_7B"))
  from configuration_baichuan import BaiChuanConfig
  from modeling_baichuan import BaiChuanForCausalLM
  config=BaiChuanConfig.from_pretrained("baichuan_7B/config.json")
  config.num_hidden_layers=1
  model = BaiChuanForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,config.max_position_embeddings//4))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="baichuan_7B.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def ChatGLM_6B(use_half,device):
  import sys
  sys.path.append("./ChatGLM_6B")
  from configuration_chatglm import ChatGLMConfig
  from modeling_chatglm import ChatGLMModel
  config=ChatGLMConfig.from_pretrained("ChatGLM_6B/config.json")
  config.num_layers=1
  model = ChatGLMModel(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,config.max_sequence_length))
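  # ChatGLM-6B derives its attention mask and position ids from where the
  # bos / [MASK] / eos tokens sit in the sequence, so inject them into the
  # random input below.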
  input_tokens[:,0]=config.bos_token_id
  input_tokens[:,2]=config.mask_token_id  
  input_tokens[:,-1]=config.eos_token_id
  with TorchDumper(TorchDumpDispatchMode,op_log_path="ChatGLM_6B.pkl"):
    output=model(input_tokens.to(device))
    logits=output.last_hidden_state
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def ChatGLM2_6B(use_half,device):
  import sys
  sys.path.append("./ChatGLM2_6B")
  from configuration_chatglm import ChatGLMConfig
  from modeling_chatglm import ChatGLMModel
  config=ChatGLMConfig.from_pretrained("ChatGLM2_6B/config.json")
  config.num_layers=1
  model = ChatGLMModel(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.padded_vocab_size,(1,config.seq_length//10))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="ChatGLM2_6B.pkl"):
    output=model(input_tokens.to(device))
    logits=output.last_hidden_state
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def ChatGLM3_6B(use_half,device):
  import sys
  sys.path.append("./ChatGLM3_6B")
  from configuration_chatglm import ChatGLMConfig
  from modeling_chatglm import ChatGLMModel
  config=ChatGLMConfig.from_pretrained("ChatGLM3_6B/config.json")
  config.num_layers=1
  model = ChatGLMModel(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.padded_vocab_size,(1,config.seq_length//4))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="ChatGLM3_6B.pkl"):
    output=model(input_tokens.to(device))
    logits=output.last_hidden_state
    loss=logits.mean()-1.0
    loss.backward()
    
@llm_forward
def deepseek_moe_16b_chat(use_half,device):
  import sys
  sys.path.append("./deepseek_moe_16b_chat")
  from configuration_deepseek import DeepseekConfig
  from modeling_deepseek import DeepseekForCausalLM
  config=DeepseekConfig.from_pretrained("deepseek_moe_16b_chat/config.json")
  config.num_hidden_layers=1
  model = DeepseekForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,config.max_position_embeddings))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="deepseek_moe_16b_chat.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def deepseek_coder_33b_base(use_half,device):
  from transformers.models.llama import LlamaForCausalLM, LlamaConfig
  config=LlamaConfig.from_pretrained("deepseek_coder_33b_base/config.json")
  config.num_hidden_layers=1
  model = LlamaForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,config.max_position_embeddings//10))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="deepseek_coder_33b_base.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def falcon_7b_instruct(use_half,device):
  import sys
  sys.path.append("./falcon_7b_instruct")
  from configuration_RW import RWConfig
  from modelling_RW import RWForCausalLM
  config=RWConfig.from_pretrained("falcon_7b_instruct/config.json")
  config.n_layer=1
  model = RWForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,512))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="falcon_7b_instruct.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def GPT2(use_half,device):
  from transformers import GPT2LMHeadModel, GPT2Config
  config=GPT2Config.from_pretrained("GPT2/config.json")
  config.n_layer=1
  model = GPT2LMHeadModel(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,512))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="GPT2.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def gemma_7b(use_half,device):
  import sys
  sys.path.append("./gemma_7b")
  from config import GemmaConfig
  from model import GemmaForCausalLM
  config=GemmaConfig.from_pretrained("gemma_7b/config.json")
  config.num_hidden_layers=1
  model = GemmaForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
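  # gemma_pytorch's GemmaForCausalLM.forward expects explicit KV caches, an
  # attention mask, position indices and sampling tensors, so build them here.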

  max_seq_len=512
  batch_size=1

  prompt_tokens=torch.randint(0,config.vocab_size,(batch_size,max_seq_len)).to(device)

  temperature= 0.95
  top_p  = 1.0
  top_k = 100
  
  # build KV caches
  kv_caches = []
  for _ in range(config.num_hidden_layers):
      size = (batch_size, max_seq_len, config.num_key_value_heads,
              config.head_dim)
      dtype = config.get_dtype()
      k_cache = torch.zeros(size=size, dtype=dtype).to(device)
      v_cache = torch.zeros(size=size, dtype=dtype).to(device)
      kv_caches.append((k_cache, v_cache))

  # prepare inputs
  input_token_ids_tensor = torch.full((batch_size, max_seq_len),
                                      0,
                                      dtype=torch.int64)
  input_token_ids_tensor = input_token_ids_tensor.to(device)
  input_positions_tensor = torch.arange(0, max_seq_len,
                                        dtype=torch.int64).to(device)
  mask_tensor = torch.full((1, 1, max_seq_len, max_seq_len),
                            -2.3819763e38).to(torch.float)
  mask_tensor = torch.triu(mask_tensor, diagonal=1).to(device)
  output_positions_tensor = torch.LongTensor([max_seq_len - 1]).to(device)
  temperatures_tensor = None if not temperature else torch.FloatTensor(
      [temperature] * batch_size).to(device)
  top_ps_tensor = torch.FloatTensor([top_p] * batch_size).to(device)
  top_ks_tensor = torch.LongTensor([top_k] * batch_size).to(device)

  with TorchDumper(TorchDumpDispatchMode,op_log_path="gemma_7b.pkl"):
    output=model(prompt_tokens,input_positions_tensor,
                  None,kv_caches,mask_tensor,output_positions_tensor,
                  temperatures_tensor,top_ps_tensor,top_ks_tensor)
    _,logits=output
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def grok1_pytorch(use_half,device):
  import sys
  sys.path.append("./grok1_pytorch")
  from configuration_grok1 import Grok1Config
  from modeling_grok1 import Grok1ModelForCausalLM
  config=Grok1Config.from_pretrained("grok1_pytorch/config.json")
  config.num_hidden_layers=1
  config.num_experts=1
  config.num_experts_per_tok=1
  model = Grok1ModelForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,512))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="grok1_pytorch.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def internLM(use_half,device):
  import sys
  sys.path.append("./internLM")
  from configuration_internlm import InternLMConfig
  from modeling_internlm import InternLMForCausalLM
  config=InternLMConfig.from_pretrained("internLM/config.json")
  config.num_hidden_layers=1
  model = InternLMForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,512))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="internLM.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def internlm2_20b(use_half,device):
  import sys
  sys.path.append("./internlm2_20b")
  from configuration_internlm2 import InternLM2Config
  from modeling_internlm2 import InternLM2ForCausalLM
  config=InternLM2Config.from_pretrained("internlm2_20b/config.json")
  config.num_hidden_layers=1
  model = InternLM2ForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,512))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="internlm2_20b.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def llama_13b(use_half,device):
  from transformers.models.llama import LlamaForCausalLM, LlamaConfig
  config=LlamaConfig.from_pretrained("llama_13b/config.json")
  config.num_hidden_layers=1
  model = LlamaForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,config.max_sequence_length))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="llama_13b.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def Llama2_13B_chat(use_half,device):
  from transformers.models.llama import LlamaForCausalLM, LlamaConfig
  config=LlamaConfig.from_pretrained("Llama2_13B_chat/config.json")
  config.num_hidden_layers=1
  model = LlamaForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,128))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="Llama2_13B_chat.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def Llama3_8B_Chinese_Chat(use_half,device):
  from transformers.models.llama import LlamaForCausalLM, LlamaConfig
  config=LlamaConfig.from_pretrained("Llama3_8B_Chinese_Chat/config.json")
  config.num_hidden_layers=1
  model = LlamaForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,128))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="Llama3_8B_Chinese_Chat.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def Mixtral_8x22B(use_half,device):
  import sys
  sys.path.append("./Mixtral_8x22B")
  from configuration_mixtral import MixtralConfig
  from modeling_mixtral import MixtralForCausalLM
  config=MixtralConfig.from_pretrained("Mixtral_8x22B/config.json")
  config.num_hidden_layers=1
  model = MixtralForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,512))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="Mixtral_8x22B.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def OLMo_7B(use_half,device):
  import sys
  sys.path.append("./OLMo_7B")
  from configuration_olmo import OLMoConfig
  from modeling_olmo import OLMoForCausalLM
  config=OLMoConfig.from_pretrained("OLMo_7B/config.json")
  config.n_layers=1
  model = OLMoForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,512))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="OLMo_7B.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def Phi3_mini_4k_instruct(use_half,device):
  import sys
  sys.path.append("./Phi3_mini_4k_instruct")
  from configuration_phi3 import Phi3Config
  from modeling_phi3 import Phi3ForCausalLM
  config=Phi3Config.from_pretrained("Phi3_mini_4k_instruct/config.json")
  config.num_hidden_layers=1
  model = Phi3ForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,512))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="Phi3_mini_4k_instruct.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def OpenELM_3B(use_half,device):
  import sys
  sys.path.append("./OpenELM_3B")
  from configuration_openelm import OpenELMConfig
  from modeling_openelm import OpenELMForCausalLM
  config=OpenELMConfig.from_pretrained("OpenELM_3B/config.json")
  config.num_transformer_layers=1
  model = OpenELMForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,512))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="OpenELM_3B.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def Qwen_14B_Chat(use_half,device):
  import sys
  sys.path.append("./Qwen_14B_Chat")
  from configuration_qwen import QWenConfig
  from modeling_qwen import QWenLMHeadModel
  config=QWenConfig.from_pretrained("Qwen_14B_Chat/config.json")
  config.num_hidden_layers=1
  model = QWenLMHeadModel(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,512))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="Qwen_14B_Chat.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def Qwen1_5_7B(use_half,device):
  import sys
  sys.path.append("./Qwen1_5_7B")
  from configuration_qwen2 import Qwen2Config
  from modeling_qwen2 import Qwen2ForCausalLM
  config=Qwen2Config.from_pretrained("Qwen1_5_7B/config.json")
  config.num_hidden_layers=1
  model = Qwen2ForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,512))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="Qwen1_5_7B.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def t5_base(use_half,device):
  import sys
  sys.path.append("./t5_base")
  from transformers import T5Config, T5ForConditionalGeneration
  config=T5Config.from_pretrained("t5_base/config.json")
  config.num_layers=1
  config.max_new_tokens=512
  model = T5ForConditionalGeneration(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,config.max_new_tokens))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="t5_base.pkl"):
    output=model.generate(input_tokens.to(device))
    #logits=output
    #loss=logits.mean()-1.0
    #loss.backward()

@llm_forward
def XVERSE_7B(use_half,device):
  import sys
  sys.path.append("./XVERSE_7B")
  from configuration_xverse import XverseConfig
  from modeling_xverse import XverseForCausalLM
  config=XverseConfig.from_pretrained("XVERSE_7B/config.json")
  config.num_hidden_layers=1
  model = XverseForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,512))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="XVERSE_7B.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def Yi_34B(use_half,device):
  from transformers.models.llama import LlamaForCausalLM, LlamaConfig
  config=LlamaConfig.from_pretrained("Yi_34B/config.json")
  config.num_hidden_layers=1
  model = LlamaForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,config.max_position_embeddings//10))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="Yi_34B.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

@llm_forward
def Yuan2_51B_hf(use_half,device):
  import sys
  sys.path.append("./Yuan2_51B_hf")
  from configuration_yuan import YuanConfig
  from yuan_hf_model import YuanForCausalLM
  config=YuanConfig.from_pretrained("Yuan2_51B_hf/config.json")
  config.num_hidden_layers=1
  config.intermediate_size=2048
  config.model_max_length=config.max_position_embeddings=2
  model = YuanForCausalLM(config)
  if use_half:
    model=model.half()
  model.train().to(device)
  input_tokens=torch.randint(0,config.vocab_size,(1,2))
  with TorchDumper(TorchDumpDispatchMode,op_log_path="Yuan2_51B_hf.pkl"):
    output=model(input_tokens.to(device))
    logits=output.logits
    loss=logits.mean()-1.0
    loss.backward()

def main():
    device="cuda"
    use_half=True
    # Run each model in its own subprocess so that all GPU memory is released
    # once that model's forward/backward pass has finished.
    pbar=tqdm(list(op_mapping.keys()))
    for name in pbar:
        torch.manual_seed(1)
        p = mp.Process(target=op_mapping[name],args=(use_half,device))
        p.start()
        p.join()
        torch.cuda.empty_cache()
        pbar.set_description(name)

if __name__=='__main__':
    main()

4. Operator List
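Each model run writes its dumped operator records to a per-model .pkl file. Assuming each record exposes the aten op name as a string (as in the illustrative dumper sketched earlier; the real torch_hook record format may differ), the per-model dumps can be merged into one combined operator list roughly like this:

import glob
import pickle

# Collect the union of operator names across all dumped models (sketch only;
# adapt the record handling to the actual torch_hook dump format).
all_ops = set()
for path in glob.glob("*.pkl"):
    with open(path, "rb") as f:
        records = pickle.load(f)
    for rec in records:
        all_ops.add(str(rec))

for name in sorted(all_ops):
    print(name)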
