I have tried this:
#############################################################################
from transformers import AutoTokenizer, pipeline, logging,AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import argparse

quantized_model_dir = "/media/galaxy/guanaco/guanaco-65B-GPTQ"
model_basename = "Guanaco-65B-GPTQ-4bit.act-order"

use_triton = False

tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=True)

quantize_config = BaseQuantizeConfig(
bits=4,
group_size=128,
desc_act=False
)

model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir,
use_safetensors=True,
model_basename=model_basename,
device="cuda:0",
use_triton=use_triton,
quantize_config=quantize_config)

Prevent printing spurious transformers error when using pipeline with AutoGPTQ

logging.set_verbosity(logging.CRITICAL)

prompt = "Tell me about AI"
prompt_template=f'''### Human: {prompt}

Assistant:'''

print("*** Pipeline:")
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tok,
max_new_tokens=512,
temperature=0.7,
top_p=0.95,
repetition_penalty=1.15
)

print(pipe(prompt_template)[0]['generated_text'])

print("\n\n*** Generate:")

input_ids = tok(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
print(tokenizer.decode(output[0]))

#############################################################################
but got an error :
Exception: data did not match any variant of untagged enum PyNormalizerTypeWrapper at line 49
column 3

I am using an 4x 40G station , any suggestion ? or it will be great if a demo python file could be provided ,thank you .

TheBloke
/

guanaco-65B-GPTQ

How to run it ?

Prevent printing spurious transformers error when using pipeline with AutoGPTQ

Assistant:'''