Spaces:

khulnasoft
/

HuggingSpaces

Runtime error

App Files Files Community

HuggingSpaces / get_token_ids.py

khulnasoft

Create get_token_ids.py

c37b750 verified 6 months ago

raw

history blame

3.52 kB

	import tiktoken

	# Tokenizer lookup table: model name -> tiktoken encoding object.
	# Models are listed under the encoding they share; the resulting key order
	# follows the order of the groups and of the names inside each group.
	ENCODINGS = {
	    model: tiktoken.get_encoding(encoding_name)
	    for encoding_name, models in (
	        ("cl100k_base", ("gpt-4", "gpt-3.5-turbo", "gpt-3.5-turbo-0301")),
	        ("p50k_base", ("text-davinci-003", "text-davinci-002")),
	        (
	            "r50k_base",
	            (
	                "text-davinci-001",
	                "text-curie-001",
	                "text-babbage-001",
	                "text-ada-001",
	                "davinci",
	                "curie",
	                "babbage",
	                "ada",
	            ),
	        ),
	    )
	    for model in models
	}

	# Context-window size, in tokens, for every supported model name.
	# Models with the same limit are grouped together.
	MAX_LENGTH = {
	    "gpt-4": 8192,
	    **dict.fromkeys(
	        (
	            "gpt-3.5-turbo",
	            "gpt-3.5-turbo-0301",
	            "text-davinci-003",
	            "text-davinci-002",
	        ),
	        4096,
	    ),
	    **dict.fromkeys(
	        (
	            "text-davinci-001",
	            "text-curie-001",
	            "text-babbage-001",
	            "text-ada-001",
	            "davinci",
	            "curie",
	            "babbage",
	            "ada",
	        ),
	        2049,
	    ),
	}

	def count_tokens(model_name, text):
	    """
	    Return how many tokens `text` occupies under a model's encoding.

	    Parameters:
	    - model_name (str): Key into ENCODINGS identifying the model.
	    - text (str): The string to tokenize.

	    Returns:
	    - int: Length of the encoded token sequence.

	    Raises:
	    - ValueError: If model_name has no entry in ENCODINGS.
	    """
	    encoder = ENCODINGS.get(model_name)
	    if encoder is None:
	        raise ValueError(f"Model name '{model_name}' not found in encodings.")
	    token_ids = encoder.encode(text)
	    return len(token_ids)

	def get_max_context_length(model_name):
	    """
	    Look up the context-window size recorded for a model.

	    Parameters:
	    - model_name (str): Key into MAX_LENGTH identifying the model.

	    Returns:
	    - int: The value stored in MAX_LENGTH for that model.

	    Raises:
	    - ValueError: If model_name has no entry in MAX_LENGTH.
	    """
	    # Success path first; anything that falls through is an unknown model.
	    if model_name in MAX_LENGTH:
	        return MAX_LENGTH[model_name]
	    raise ValueError(f"Model name '{model_name}' not found in max length dictionary.")

	def get_token_ids_for_text(model_name, text):
	    """
	    Encode `text` with a model's encoding and return the distinct token IDs.

	    Parameters:
	    - model_name (str): Key into ENCODINGS identifying the model.
	    - text (str): The string to tokenize.

	    Returns:
	    - list: Each token ID that occurs in the encoding, listed once. The
	      IDs pass through a set, so their order is the set's iteration order,
	      not the order of appearance in the text.

	    Raises:
	    - ValueError: If model_name has no entry in ENCODINGS.
	    """
	    encoder = ENCODINGS.get(model_name)
	    if encoder is None:
	        raise ValueError(f"Model name '{model_name}' not found in encodings.")
	    distinct_ids = set(encoder.encode(text))
	    return list(distinct_ids)

	def get_token_ids_for_task_parsing(model_name):
	"""
	Get unique token IDs for task parsing.

	Parameters:
	- model_name (str): The name of the model.

	Returns:
	- list: A list of unique token IDs for task parsing.
	"""
	text = '''{"task": "text-classification", "token-classification", "text2text-generation", "summarization", "translation", "question-answering", "conversational", "text-generation", "sentence-similarity", "tabular-classification", "object-detection", "image-classification", "image-to-image", "image-to-text", "text-to-image", "visual-question-answering", "document-question-answering", "image-segmentation", "text-to-speech", "text-to-video", "automatic-speech-recognition", "audio-to-audio", "audio-classification", "canny-control", "hed-control", "mlsd-control", "normal-control", "openpose-control", "canny-text-to-image", "depth-text-to-image", "hed-text-to-image", "mlsd-text-to-image", "normal-text-to-image", "openpose-text