# H2OTest / llm_studio/src/possible_values.py
# elineve's picture
# Upload 301 files
# 07423df
import os
from abc import abstractmethod
from dataclasses import dataclass
from typing import Any, Callable, List, Optional, Sequence, Set, Tuple, Union
from llm_studio.src.nesting import Dependency
def _scan_dirs(dirname) -> List[str]:
    """Recursively collects all subfolders of a directory.

    Args:
        dirname: directory name

    Returns:
        List of subfolder paths, each ending with "/". The direct children
        come first, followed by the recursive results for each child.
        The order among siblings is whatever `os.scandir` yields.
    """
    # Use the context manager so the scandir iterator is closed
    # deterministically, even if `is_dir()` raises midway.
    with os.scandir(dirname) as entries:
        children = [entry.path for entry in entries if entry.is_dir()]

    subfolders = list(children)
    for child in children:
        subfolders.extend(_scan_dirs(child))

    # Normalize every path to carry exactly one trailing slash marker.
    return [path if path.endswith("/") else path + "/" for path in subfolders]
def _scan_files(
    dirname, extensions: Tuple[str, ...] = (".csv", ".pq", ".parquet", ".json")
) -> List[str]:
    """Walks a directory tree and collects candidate data files.

    Args:
        dirname: root directory of the walk
        extensions: extension strings to look for. A file qualifies when
            its name contains at least one of them (this is a substring
            check, not a strict suffix check).

    Returns:
        Sorted list of matching file paths. Files whose name starts with
        "__meta_info__" are left out.
    """
    matches: List[str] = []
    for folder, _subdirs, names in os.walk(dirname):
        for name in names:
            if name.startswith("__meta_info__"):
                continue
            if any(ext in name for ext in extensions):
                matches.append(os.path.join(folder, name))
    matches.sort()
    return matches
def strip_prefix(
    paths: Sequence[str], ignore_set: Optional[Set[str]] = None
) -> Tuple[str, ...]:
    """
    Strips the common prefix of all the given paths.

    Args:
        paths: the paths to strip
        ignore_set: set of path names to ignore when computing the prefix.
            Ignored entries are returned unchanged. Defaults to ignoring
            nothing.

    Returns:
        Tuple with the same length as `paths` without common prefixes.
    """
    # Build the empty default per call instead of sharing one mutable
    # default object across all calls.
    ignored = set() if ignore_set is None else ignore_set

    # The prefix is computed over the parent directories, so every path
    # keeps at least its last component after stripping.
    parent_dirs = [
        os.path.split(os.path.normpath(path))[0]
        for path in paths
        if path not in ignored
    ]
    if not parent_dirs:
        return tuple(paths)

    prefix = os.path.commonpath(parent_dirs)
    return tuple(
        path if path in ignored else os.path.relpath(path, prefix)
        for path in paths
    )
class Value:
    """Empty placeholder class; it defines no attributes or methods."""
@dataclass
class Number:
    """Describes a numeric setting through `min`, `max` and `step`.

    Every field has a default, so `Number()` is a valid instance.
    """

    # Optional float, unset (None) by default; presumably the lower bound.
    min: Optional[float] = None
    # Optional float, unset (None) by default; presumably the upper bound.
    max: Optional[float] = None
    # Either a float or a string; defaults to 1.0.
    step: Union[str, float] = 1.0
@dataclass
class String:
    """Describes a string setting and the values it may take."""

    # A tuple of entries, where every entry is one of:
    # - a (value, name) tuple
    # - a plain string, which then serves as both the value and the name
    values: Any = None
    # Defaults to False; presumably whether values outside `values` are
    # accepted.
    allow_custom: bool = False
    # Optional string, None by default.
    placeholder: Optional[str] = None
class DatasetValue:
    """Base class for possible values that are derived from a dataset.

    NOTE: the class does not derive from `abc.ABC`, so the `@abstractmethod`
    marker on `get_value` is not enforced when instantiating.
    """

    @abstractmethod
    def get_value(
        self, dataset: Any, value: Any, type_annotation: type, mode: str
    ) -> Tuple["String", Any]:
        """
        Compute the possible values and the current value for a dataset.

        Args:
            dataset: The dataset to derive the possible values from.
            value: The preliminary current value.
            type_annotation: Type annotation of the setting being filled.
            mode: Mode string handed through to the implementation.
        Returns:
            Tuple of the possible values (as `String`) and the current value.
        """
        pass

    @staticmethod
    def _compute_current_values(
        current_values: Optional[Union[List[str], str]],
        possible_values: List[str],
        prefer_with: Optional[Callable[[str], bool]] = None,
    ) -> List[str]:
        """
        Compute current values.

        Args:
            current_values: The preliminary current values. `None` is treated
                as "no values" and a single string as a one-element list.
            possible_values: All possible values.
            prefer_with: Function determining which values to prefer as default.
        Returns:
            A new list with the current values that are possible. If none is
            left, all possible values preferred by `prefer_with`; if that is
            empty too, the first possible value. `[""]` if there are no
            possible values at all.
        """
        if not possible_values:
            return [""]

        # Normalize the input: a bare string must count as one value (iterating
        # it would compare single characters), and None means nothing selected.
        if current_values is None:
            current_values = []
        elif isinstance(current_values, str):
            current_values = [current_values]

        # allow only values which are in the possible values
        selected = [v for v in current_values if v in possible_values]
        if selected:
            return selected

        # nothing valid is selected: take all values where `prefer_with` is true
        if prefer_with is not None:
            selected = [c for c in possible_values if prefer_with(c)]

        # if they are still empty, just take the first possible value
        return selected or [possible_values[0]]
@dataclass
class Directories(DatasetValue):
    """Offers the subfolders found under the dataset path as choices."""

    # Whether a "None" entry is offered: either a fixed bool, or a callable
    # that decides based on the mode.
    add_none: Union[bool, Callable[[str], bool]] = False
    # Predicate selecting the entries to fall back to when none of the
    # current values is available.
    prefer_with: Optional[Callable[[str], bool]] = None
    # True places the "None" entry first, False places it last.
    prefer_none: bool = True

    def get_value(self, dataset, value, type_annotation, mode) -> Tuple[String, Any]:
        """Builds the directory choices and resolves the current selection.

        Args:
            dataset: mapping read through its "path" key, or None
            value: preliminary current value; a single string is wrapped
                into a list
            type_annotation: when equal to `Tuple[str, ...]` the whole
                selection is returned, otherwise only its first element
            mode: passed to `add_none` when that is a callable

        Returns:
            Tuple of a `String` holding (path, stripped path) pairs and the
            current selection. Without a dataset, an empty `String` and the
            unchanged `value`.
        """
        if dataset is None:
            return String(tuple()), value

        choices = _scan_dirs(dataset["path"])

        if isinstance(self.add_none, bool):
            with_none = self.add_none
        elif callable(self.add_none):
            with_none = self.add_none(mode)
        else:
            with_none = False

        if with_none:
            if self.prefer_none:
                choices.insert(0, "None")
            else:
                choices.append("None")

        selected = [value] if isinstance(value, str) else value
        selected = DatasetValue._compute_current_values(
            selected, choices, self.prefer_with
        )

        # "None" is not a real path, so it must not influence the prefix.
        labels = strip_prefix(choices, ignore_set={"None"})
        options = String(tuple(zip(choices, labels)))

        if type_annotation == Tuple[str, ...]:
            return options, selected
        return options, selected[0]
@dataclass
class Files(DatasetValue):
    """Offers the data files found under the dataset path as choices."""

    # Whether a "None" entry is offered: either a fixed bool, or a callable
    # that decides based on the mode.
    add_none: Union[bool, Callable[[str], bool]] = False
    # Predicate selecting the entries to fall back to when none of the
    # current values is available.
    prefer_with: Optional[Callable[[str], bool]] = None
    # For the case where no match is found: True lists "None" first (so no
    # file is the fallback), False lists it last (so a file is the fallback).
    prefer_none: bool = True

    def get_value(self, dataset, value, type_annotation, mode) -> Tuple[String, Any]:
        """Builds the file choices and resolves the current selection.

        Args:
            dataset: mapping read through its "path" key, or None
            value: preliminary current value; a single string is wrapped
                into a list
            type_annotation: when equal to `Tuple[str, ...]` the whole
                selection is returned, otherwise only its first element
            mode: passed to `add_none` when that is a callable

        Returns:
            Tuple of a `String` holding (path, stripped path) pairs and the
            current selection. Without a dataset, an empty `String` and the
            unchanged `value`.
        """
        if dataset is None:
            return String(tuple()), value

        candidates = _scan_files(dataset["path"])

        if isinstance(self.add_none, bool):
            offer_none = self.add_none
        elif callable(self.add_none):
            offer_none = self.add_none(mode)
        else:
            offer_none = False

        if offer_none:
            if self.prefer_none:
                candidates.insert(0, "None")
            else:
                candidates.append("None")

        chosen = [value] if isinstance(value, str) else value
        chosen = DatasetValue._compute_current_values(
            chosen, candidates, self.prefer_with
        )

        # "None" is not a real path, so it must not influence the prefix.
        display_names = strip_prefix(candidates, ignore_set={"None"})
        options = String(tuple(zip(candidates, display_names)))

        if type_annotation == Tuple[str, ...]:
            return options, chosen
        return options, chosen[0]
@dataclass
class Columns(DatasetValue):
    """Offers the columns of the dataset's dataframe as choices."""

    # Whether a "None" entry is offered (always first): either a fixed bool,
    # or a callable that decides based on the mode.
    add_none: Union[bool, Callable[[str], bool]] = False
    # Predicate selecting the entries to fall back to when none of the
    # current values is available.
    prefer_with: Optional[Callable[[str], bool]] = None

    def get_value(self, dataset, value, type_annotation, mode) -> Tuple[String, Any]:
        """Builds the column choices and resolves the current selection.

        Args:
            dataset: mapping read through its "dataframe" key, or None
            value: preliminary current value; a single string is wrapped
                into a list, None selects the first choice
            type_annotation: when equal to `Tuple[str, ...]` the whole
                selection is returned, otherwise only its first element
            mode: passed to `add_none` when that is a callable

        Returns:
            Tuple of a `String` holding the column names and the current
            selection. Without a dataset, an empty `String` and the
            unchanged `value`.
        """
        if dataset is None:
            return String(tuple()), value

        try:
            columns = list(dataset["dataframe"].columns)
        except KeyError:
            # No dataframe available: there are no columns to offer.
            columns = []

        if (isinstance(self.add_none, bool) and self.add_none) or (
            callable(self.add_none) and self.add_none(mode)
        ):
            columns.insert(0, "None")

        if isinstance(value, str):
            value = [value]
        if value is None:
            # Default to the first choice. Slicing (instead of indexing)
            # avoids an IndexError when there are no choices at all.
            value = columns[:1]

        value = DatasetValue._compute_current_values(value, columns, self.prefer_with)

        return (
            String(tuple(columns)),
            value if type_annotation == Tuple[str, ...] else value[0],
        )
@dataclass
class ColumnValue(DatasetValue):
    """Offers the distinct values of one dataframe column as choices."""

    # Name of the dataframe column whose unique values are offered.
    column: str
    # Choices used when the column values cannot be taken from the dataframe.
    default: List[str]
    # Predicate selecting the entries to fall back to when none of the
    # current values is available.
    prefer_with: Optional[Callable[[str], bool]] = None
    # Optional condition, checked against `dataset[dependency.key]`; when
    # it is not met, `default` is used.
    dependency: Optional[Dependency] = None

    def get_value(self, dataset, value, type_annotation, mode) -> Tuple[String, Any]:
        """Builds the value choices and resolves the current selection.

        Args:
            dataset: mapping read through its "dataframe" key (and the
                dependency key, if a dependency is set), or None
            value: preliminary current value; a single string is wrapped
                into a list, None is treated as no selection
            type_annotation: when equal to `Tuple[str, ...]` the whole
                selection is returned, otherwise only its first element
            mode: unused here

        Returns:
            Tuple of a `String` holding the choices and the current
            selection. Without a dataset, an empty `String` and the
            unchanged `value`.
        """
        if dataset is None:
            return String(tuple()), value

        try:
            df = dataset["dataframe"]
        except KeyError:
            df = None

        # Fall back to the defaults unless the dataframe can provide values.
        values = self.default
        if df is not None:
            dependency_met = self.dependency is None or self.dependency.check(
                [dataset[self.dependency.key]]
            )
            if dependency_met and self.column in df:
                values = [str(v) for v in sorted(df[self.column].unique())]

        # Normalize like the sibling classes do: a bare string is one value
        # (iterating it would compare single characters), None is nothing.
        if isinstance(value, str):
            value = [value]
        elif value is None:
            value = []

        value = DatasetValue._compute_current_values(value, values, self.prefer_with)

        return (
            String(tuple(values)),
            value if type_annotation == Tuple[str, ...] else value[0],
        )