Name:           python-tokenizers
Version:        0.22.1
Release:        %autorelease
Summary:        Implementation of today's most used tokenizers

# Generated license info from Rust dependencies
#
# (MIT OR Apache-2.0) AND Unicode-DFS-2016
# Apache-2.0
# Apache-2.0 AND MIT
# Apache-2.0 OR BSL-1.0
# Apache-2.0 OR MIT
# Apache-2.0 OR MIT OR Zlib
# BSD-2-Clause
# BSD-2-Clause OR Apache-2.0 OR MIT
# MIT
# MIT OR Apache-2.0
# Unlicense OR MIT
# License expression simplified by elimination, as permitted
%define license_expression %{shrink:
    Unicode-DFS-2016 AND
    Apache-2.0 AND
    (Apache-2.0 OR BSL-1.0) AND
    (Apache-2.0 OR MIT OR Zlib) AND
    BSD-2-Clause AND
    MIT AND
    (Unlicense OR MIT)
}

SourceLicense:  Apache-2.0
License:        %{license_expression}

URL:            https://github.com/huggingface/tokenizers
Source:         %{pypi_source tokenizers}

BuildRequires:  python3-devel
BuildRequires:  cargo-rpm-macros >= 24
BuildRequires:  tomcli
# TODO: For some reason the generated buildrequires don't catch this?
BuildRequires:  crate(tempfile/default)

# For tests
BuildRequires:  python3dist(datasets)
BuildRequires:  python3dist(numpy)
BuildRequires:  python3dist(pytest)
BuildRequires:  python3dist(pytest-asyncio)
BuildRequires:  python3dist(pytest-datadir)
BuildRequires:  python3dist(pytest-xdist)

# Fill in the actual package description to submit package to Fedora
%global _description %{expand:
Provides an implementation of today's most used tokenizers, with a focus on
performance and versatility. Bindings over the rust-tokenizers implementation.}

%description %_description

%package -n     python3-tokenizers
Summary:        %{summary}

%description -n python3-tokenizers %_description


%prep
%autosetup -p1 -n tokenizers-%{version}
%cargo_prep

# Copy out LICENSE
cp -a tokenizers/LICENSE LICENSE
# Remove vendored tokenizers
rm -r tokenizers/
# Remove locked versions
rm bindings/python/Cargo.lock
# Replace the path-based dependency on the bundled crate with an exact-version
# dependency.
tomcli set bindings/python/Cargo.toml del dependencies.tokenizers.path
tomcli set bindings/python/Cargo.toml str dependencies.tokenizers.version '=%{version}'


%generate_buildrequires
# Get the cargo buildrequires first, so that maturin will succeed
cd bindings/python/
%cargo_generate_buildrequires
cd ../../
%pyproject_buildrequires


%build
# Generate the dependency license file first, so maturin will find it
cd bindings/python/
%cargo_license_summary
%{cargo_license} > LICENSE.dependencies
cd ../../
%pyproject_wheel


%install
%pyproject_install
# When saving the files, assert that a license file was found
%pyproject_save_files -l tokenizers


%check
%pyproject_check_import
cd bindings/python
# TODO: The cargo tests for the bindings fail to link to Python for reasons I don't understand
#cargo_test

# only run the tests, not the benches
# the deselected tests are:
# - test_datasets fails with: "TypeError: Pickler._batch_setitems() takes 2 positional arguments but 3 were given"
# - test_gzip fails with: "FileNotFoundError: [Errno 2] No such file or directory: 'data/my-file.0.gz'"
%pytest -s -v ./tests/ \
    --deselect="tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_datasets" \
    --deselect="tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_gzip"
cd ../../


%files -n python3-tokenizers -f %{pyproject_files}
%doc bindings/python/README.md bindings/python/CHANGELOG.md


%changelog
%autochangelog