## START: Set by rpmautospec
## (rpmautospec version 0.8.3)
## RPMAUTOSPEC: autorelease, autochangelog
%define autorelease(e:s:pb:n) %{?-p:0.}%{lua:
    release_number = 1;
    base_release_number = tonumber(rpm.expand("%{?-b*}%{!?-b:1}"));
    print(release_number + base_release_number - 1);
}%{?-e:.%{-e*}}%{?-s:.%{-s*}}%{!?-n:%{?dist}}
## END: Set by rpmautospec

# Build conditional "check" (enabled by default). Building "--without check"
# drops the test-only BuildRequires below and skips the Rust and pytest test
# runs; the import smoke test in the check section always runs.
%bcond check 1

Name:           python-tokenizers
Version:        0.22.2
Release:        %autorelease
Summary:        Implementation of today's most used tokenizers

# pyarrow and pandas are not available on i686
# https://fedoraproject.org/wiki/Changes/EncourageI686LeafRemoval
ExcludeArch:    %{ix86}

# Generated license info from Rust dependencies
### BEGIN LICENSE SUMMARY ###
#
# (MIT OR Apache-2.0) AND Unicode-DFS-2016
# Apache-2.0
# Apache-2.0 AND MIT
# Apache-2.0 OR BSL-1.0
# Apache-2.0 OR MIT
# Apache-2.0 OR MIT OR Zlib
# BSD-2-Clause
# BSD-2-Clause OR Apache-2.0 OR MIT
# MIT
# MIT OR Apache-2.0
# Unlicense OR MIT
### END LICENSE SUMMARY ###

# License expression simplified by the special rule for OR expressions:
# https://docs.fedoraproject.org/en-US/legal/license-field/#_special_rules_for_or_expressions
%global license_expression %{shrink:
    Unicode-DFS-2016 AND
    Apache-2.0 AND
    (Apache-2.0 AND MIT) AND
    (Apache-2.0 OR BSL-1.0) AND
    (Apache-2.0 OR MIT OR Zlib) AND
    BSD-2-Clause AND
    MIT AND
    (Unlicense OR MIT)
}

# SourceLicense covers the shipped sources; License covers the built binary,
# which statically links the Rust dependencies summarized above.
SourceLicense:  Apache-2.0
License:        %{license_expression}
URL:            https://github.com/huggingface/tokenizers
Source:         %{pypi_source tokenizers}

# A patch I wrote, updating the sources for PyO3 0.27 support
# https://github.com/huggingface/tokenizers/pull/1941.patch
Patch:          1941.patch

BuildRequires:  python3-devel
BuildRequires:  cargo-rpm-macros >= 24
# tomcli is used in the prep section to edit bindings/python/Cargo.toml
BuildRequires:  tomcli

%if %{with check}
# For tests
BuildRequires:  python3dist(datasets)
BuildRequires:  python3dist(numpy)
BuildRequires:  python3dist(pytest)
BuildRequires:  python3dist(pytest-asyncio)
BuildRequires:  python3dist(pytest-datadir)
BuildRequires:  python3dist(pytest-xdist)
%endif

%global _description %{expand:
Provides an implementation of today's most used tokenizers, with a focus on
performance and versatility. Bindings over the rust-tokenizers implementation.}

%description %_description

%package -n     python3-tokenizers
Summary:        %{summary}

%description -n python3-tokenizers %_description


%prep
%autosetup -p1 -n tokenizers-%{version}
%cargo_prep

# Copy out LICENSE
cp -a tokenizers/LICENSE LICENSE

# Remove vendored tokenizers
rm -r tokenizers/

# Remove locked versions
rm bindings/python/Cargo.lock

# Replace the path-based dependency on the bundled crate with an exact-version
# dependency.
tomcli set bindings/python/Cargo.toml del dependencies.tokenizers.path
tomcli set bindings/python/Cargo.toml str dependencies.tokenizers.version '=%{version}'


%generate_buildrequires
# Get the cargo buildrequires first, so that maturin will succeed
cd bindings/python/
%cargo_generate_buildrequires
cd ../../
%pyproject_buildrequires


%build
# Generate the dependency license file first, so maturin will find it
cd bindings/python/
%cargo_license_summary
%{cargo_license} > ../../LICENSE.dependencies
cd ../../
%pyproject_wheel


%install
%pyproject_install
# When saving the files, assert that a license file was found
%pyproject_save_files -l tokenizers


%check
# The import smoke test needs no test dependencies, so it runs unconditionally
%pyproject_check_import

%if %{with check}
cd bindings/python
# Per the Makefile, this option is required for tests to pass
%cargo_test -- --no-default-features

# only run the tests, not the benches
# the deselected tests are:
# - test_datasets fails with: "TypeError: Pickler._batch_setitems() takes 2 positional arguments but 3 were given"
# - test_gzip fails with: "FileNotFoundError: [Errno 2] No such file or directory: 'data/my-file.0.gz'"
# - the rest of the deselects are network accesses
%pytest -s -v ./tests/ \
    --deselect="tests/bindings/test_encoding.py::TestEncoding::test_char_to_token" \
    --deselect="tests/bindings/test_encoding.py::TestEncoding::test_char_to_word" \
    --deselect="tests/bindings/test_encoding.py::TestEncoding::test_invalid_truncate_direction" \
    --deselect="tests/bindings/test_encoding.py::TestEncoding::test_n_sequences" \
    --deselect="tests/bindings/test_encoding.py::TestEncoding::test_sequence_ids" \
    --deselect="tests/bindings/test_encoding.py::TestEncoding::test_token_to_chars" \
    --deselect="tests/bindings/test_encoding.py::TestEncoding::test_token_to_sequence" \
    --deselect="tests/bindings/test_encoding.py::TestEncoding::test_token_to_word" \
    --deselect="tests/bindings/test_encoding.py::TestEncoding::test_truncation" \
    --deselect="tests/bindings/test_encoding.py::TestEncoding::test_word_to_chars" \
    --deselect="tests/bindings/test_encoding.py::TestEncoding::test_word_to_tokens" \
    --deselect="tests/bindings/test_models.py::TestBPE::test_instantiate" \
    --deselect="tests/bindings/test_models.py::TestWordLevel::test_instantiate" \
    --deselect="tests/bindings/test_models.py::TestWordPiece::test_instantiate" \
    --deselect="tests/bindings/test_processors.py::TestByteLevelProcessing::test_processing" \
    --deselect="tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_async_methods_existence" \
    --deselect="tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_basic_encoding" \
    --deselect="tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_concurrency" \
    --deselect="tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_decode" \
    --deselect="tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_encode" \
    --deselect="tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_error_handling" \
    --deselect="tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_large_batch" \
    --deselect="tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_numpy_inputs" \
    --deselect="tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_performance_comparison" \
    --deselect="tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_various_input_formats" \
    --deselect="tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_with_special_tokens" \
    --deselect="tests/bindings/test_tokenizer.py::TestAsyncTokenizer::test_with_truncation_padding" \
    --deselect="tests/bindings/test_tokenizer.py::TestTokenizer::test_decode_skip_special_tokens" \
    --deselect="tests/bindings/test_tokenizer.py::TestTokenizer::test_decode_stream_fallback" \
    --deselect="tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_add_special_tokens" \
    --deselect="tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_formats" \
    --deselect="tests/bindings/test_tokenizer.py::TestTokenizer::test_encode_special_tokens" \
    --deselect="tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained" \
    --deselect="tests/bindings/test_tokenizer.py::TestTokenizer::test_from_pretrained_revision" \
    --deselect="tests/bindings/test_tokenizer.py::TestTokenizer::test_multiprocessing_with_parallelism" \
    --deselect="tests/bindings/test_tokenizer.py::TestTokenizer::test_splitting" \
    --deselect="tests/bindings/test_trainers.py::TestUnigram::test_continuing_prefix_trainer_mismatch" \
    --deselect="tests/bindings/test_trainers.py::TestUnigram::test_train" \
    --deselect="tests/bindings/test_trainers.py::TestUnigram::test_train_parallelism_with_custom_pretokenizer" \
    --deselect="tests/documentation/test_pipeline.py::TestPipeline::test_bert_example" \
    --deselect="tests/documentation/test_pipeline.py::TestPipeline::test_pipeline" \
    --deselect="tests/documentation/test_quicktour.py::TestQuicktour::test_quicktour" \
    --deselect="tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_datasets" \
    --deselect="tests/documentation/test_tutorial_train_from_iterators.py::TestTrainFromIterators::test_gzip" \
    --deselect="tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_basic_encode" \
    --deselect="tests/implementations/test_bert_wordpiece.py::TestBertWordPieceTokenizer::test_multiprocessing_with_parallelism" \
    --deselect="tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_add_prefix_space" \
    --deselect="tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_basic_encode" \
    --deselect="tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_lowerspace" \
    --deselect="tests/implementations/test_byte_level_bpe.py::TestByteLevelBPE::test_multiprocessing_with_parallelism" \
    --deselect="tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_basic_encode" \
    --deselect="tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_decoding" \
    --deselect="tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_lowercase" \
    --deselect="tests/implementations/test_char_bpe.py::TestCharBPETokenizer::test_multiprocessing_with_parallelism" \
    --deselect="tests/test_serialization.py::TestSerialization::test_full_serialization_albert" \
    --deselect="tests/test_serialization.py::TestSerialization::test_str_big"
cd ../../
%endif


%files -n python3-tokenizers -f %{pyproject_files}
%doc bindings/python/README.md bindings/python/CHANGELOG.md
%license LICENSE
%license LICENSE.dependencies


%changelog
## START: Generated by rpmautospec
* Thu Feb 12 2026 Alexander F. Lent - 0.22.2-1
- Initial import of 0.22.2, fixes rhbz#2388154
## END: Generated by rpmautospec