{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "-DBXBd1Q6SFF" }, "outputs": [], "source": [ "import requests\n", "from typing import List, Dict, Any, Iterator\n", "\n", "class DatasetSearchClient:\n", " def __init__(self, base_url: str = \"https://librarian-bots-dataset-column-search-api.hf.space\"):\n", " self.base_url = base_url\n", "\n", " def search(self,\n", " columns: List[str],\n", " match_all: bool = False,\n", " page_size: int = 100) -> Iterator[Dict[str, Any]]:\n", " \"\"\"\n", " Search datasets using the provided API, automatically handling pagination.\n", "\n", " Args:\n", " columns (List[str]): List of column names to search for.\n", " match_all (bool, optional): If True, match all columns. If False, match any column. Defaults to False.\n", " page_size (int, optional): Number of results per page. Defaults to 100.\n", "\n", " Yields:\n", " Dict[str, Any]: Each dataset result from all pages.\n", "\n", " Raises:\n", " requests.RequestException: If there's an error with the HTTP request.\n", " ValueError: If the API returns an unexpected response format.\n", " \"\"\"\n", " page = 1\n", " total_results = None\n", "\n", " while total_results is None or (page - 1) * page_size < total_results:\n", " params = {\n", " \"columns\": columns,\n", " \"match_all\": str(match_all).lower(),\n", " \"page\": page,\n", " \"page_size\": page_size\n", " }\n", "\n", " try:\n", " response = requests.get(f\"{self.base_url}/search\", params=params)\n", " response.raise_for_status()\n", " data = response.json()\n", "\n", " if not {\"total\", \"page\", \"page_size\", \"results\"}.issubset(data.keys()):\n", " raise ValueError(\"Unexpected response format from the API\")\n", "\n", " if total_results is None:\n", " total_results = data['total']\n", "\n", " for dataset in data['results']:\n", " yield dataset\n", "\n", " page += 1\n", "\n", " except requests.RequestException as e:\n", " raise requests.RequestException(f\"Error connecting to the API: {str(e)}\")\n", " except ValueError as e:\n", " raise ValueError(f\"Error processing API response: {str(e)}\")\n", "\n", "# Create an instance of the client\n", "client = DatasetSearchClient()" ] }, { "cell_type": "code", "source": [ "results = list(client.search(['tools'],match_all=True))\n", "len(results)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9yupgFYx6Sqx", "outputId": "ac6d7c15-2267-4bbd-ceaa-1d98faee188b" }, "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "38" ] }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "code", "source": [ "results[0]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "atL-PQq76VrV", "outputId": "f357fe16-a1f9-4bb2-ca3d-767f3ac6508d" }, "execution_count": 6, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'hub_id': 'llamafactory/glaive_toolcall_en',\n", " 'likes': 1,\n", " 'downloads': 1151,\n", " 'tags': ['task_categories:text-generation',\n", " 'task_categories:question-answering',\n", " 'language:en',\n", " 'license:apache-2.0',\n", " 'size_categories:1K<n<10K',\n", " 'json',\n", " 'text',\n", " 'datasets',\n", " 'mlcroissant',\n", " 'region:us',\n", " 'llama-factory',\n", " 'croissant'],\n", " 'created_at': 1715955540,\n", " 'last_modified': 1717785919,\n", " 'license': ['apache-2.0'],\n", " 'language': ['en'],\n", " 'config_name': 'default',\n", " 'column_names': ['conversations', 'tools'],\n", " 'features': [{'name': 'conversations',\n", " 'list': [{'name': 'from', 'dtype': 'string'},\n", " {'name': 'value', 'dtype': 'string'}]},\n", " {'name': 'tools', 'dtype': 'string'}],\n", " 'match_count': 1}" ] }, "metadata": {}, "execution_count": 6 } ] }, { "cell_type": "code", "source": [ "from huggingface_hub import create_collection, add_collection_item" ], "metadata": { "id": "pXKtgF3r7GSK" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "source": [ "collection = create_collection(\"Probably function calling datasets\", namespace=\"librarian-bots\",)" ], "metadata": { "id": "MzkGofqF7M0i" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "collection.slug" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 36 }, "id": "rAGoahvb7Ucp", "outputId": "c5f7b158-85cb-49be-903f-7caaa98f7b74" }, "execution_count": 12, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'librarian-bots/probably-function-calling-datasets-6683d24da13a7bb7efee7464'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "code", "source": [ "for item in results:\n", " add_collection_item(collection.slug, item['hub_id'], item_type=\"dataset\")" ], "metadata": { "id": "LR6nJyCL7ZZK" }, "execution_count": 13, "outputs": [] } ] }