Instructions to use Susav/PolarSparsity with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Susav/PolarSparsity with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-generation", model="Susav/PolarSparsity")

# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("Susav/PolarSparsity", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Susav/PolarSparsity with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "Susav/PolarSparsity"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Susav/PolarSparsity",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker
docker model run hf.co/Susav/PolarSparsity
- SGLang
How to use Susav/PolarSparsity with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "Susav/PolarSparsity" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Susav/PolarSparsity",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "Susav/PolarSparsity" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Susav/PolarSparsity",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
- Docker Model Runner
How to use Susav/PolarSparsity with Docker Model Runner:
docker model run hf.co/Susav/PolarSparsity
| # python run_sparse_mlp.py --in_features 8192 --batch_size 16 --index_size 8192 | |
| import torch | |
| from HybridTensor.modules.SelectiveMLP import SelectiveMLP | |
| from HybridTensor.utils.profiling import cuda_profiler | |
| from HybridTensor.utils.utils import arg_parser, sparse_index | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| # standard MLP implementation | |
class Mlp(nn.Module):
    """Dense two-layer feed-forward block: ``fc1`` -> activation -> ``fc2``.

    Args:
        in_features: Input width passed to the first linear layer.
        hidden_features: Width of the hidden layer; when ``None`` it is
            ``in_features * 4``.
        out_features: Output width of the second linear layer; when ``None``
            it equals ``in_features``.
        activation: Callable applied to the output of ``fc1``.
        bias1: Whether ``fc1`` is created with a bias term.
        bias2: Whether ``fc2`` is created with a bias term.
        return_residual: When true, ``forward`` returns ``(output, input)``
            instead of just ``output``.
        device: Forwarded to both ``nn.Linear`` constructors.
        dtype: Forwarded to both ``nn.Linear`` constructors.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        activation=F.gelu,
        bias1=True,
        bias2=True,
        return_residual=False,
        device=None,
        dtype=None,
    ):
        super().__init__()

        # Resolve the optional widths before building any layer.
        if out_features is None:
            out_features = in_features
        if hidden_features is None:
            hidden_features = in_features * 4

        linear_opts = dict(device=device, dtype=dtype)

        self.return_residual = return_residual
        # Attribute order (fc1, activation, fc2) is kept on purpose so module
        # registration and parameter initialisation happen in the same order.
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **linear_opts)
        self.activation = activation
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **linear_opts)

    def forward(self, x):
        """Apply ``fc2(activation(fc1(x)))``; optionally also return ``x``."""
        hidden = self.activation(self.fc1(x))
        out = self.fc2(hidden)
        if self.return_residual:
            return out, x
        return out
if __name__ == "__main__":
    # Benchmark a dense MLP against SelectiveMLP on the same random input and
    # report both timings plus the resulting speedup.
    args = arg_parser()

    bias = args.bias > 0
    in_features = args.in_features
    hidden_features = in_features * 4
    out_features = in_features
    # Single source of truth for the activation: the name is passed to
    # SelectiveMLP and the matching callable is passed to the dense baseline.
    activation = "relu"
    device = torch.device("cuda")
    dtype = torch.float16

    sparse_mlp = SelectiveMLP(
        in_features,
        hidden_features,
        out_features,
        activation=activation,
        use_heuristic=False,
        bias1=bias,
        bias2=bias,
        device=device,
        dtype=dtype,
    )

    activation_fn = F.relu if activation == "relu" else F.gelu
    dense_mlp = Mlp(
        in_features=in_features,
        hidden_features=hidden_features,
        out_features=out_features,
        bias1=bias,
        bias2=bias,
        activation=activation_fn,
        device=device,
        dtype=dtype,
    )

    # Create random input tensor
    x = torch.randn(args.batch_size, in_features, device=device, dtype=dtype)
    # sparse_index returns a pair; only its first element is used here.
    # NOTE(review): presumably index_size indices into the hidden dimension —
    # confirm against HybridTensor.utils.utils.sparse_index.
    index_vec, _ = sparse_index(args.index_size, hidden_features)

    # dense mlp time
    dense_mlp_out, dense_mlp_time = cuda_profiler(dense_mlp, x)

    # sparse mlp time
    sparse_mlp_out, sparse_mlp_time = cuda_profiler(sparse_mlp, x, index_vec)

    print(f"Dense MLP time: {dense_mlp_time:.4f} ms")
    print(f"Sparse MLP time: {sparse_mlp_time:.4f} ms")
    print(f"Speedup: {dense_mlp_time/sparse_mlp_time:.2f}x")