Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Problem using #include "tokenizers_c.h" #61

Open
doogunwo opened this issue Feb 13, 2025 · 0 comments
Open

Problem using #include "tokenizers_c.h" #61

doogunwo opened this issue Feb 13, 2025 · 0 comments

Comments

@doogunwo
Copy link

doogunwo commented Feb 13, 2025

Hi, I'm trying to use that C binding in SPDK.
I'm trying to test the movement before that,
Here is the code I use:

`#include <stdio.h>
#include <stdlib.h>
#include "tokenizers_c.h"
#include <string.h>
#include <assert.h>

size_t file_length(FILE *file){
assert(file);
fseek(file, 0, SEEK_END);
size_t length = ftell(file);
rewind(file);
return length;
}

//
char* read_file(const char* filename, size_t* out_size) {
FILE* file = fopen(filename, "r");
if (!file) {
printf("Error: Cannot open file: %s\n", filename);
return NULL;
}
out_size = file_length(file);
char
buffer = (char*)malloc(*out_size + 1);
if (!buffer) {
printf("Memory allocation failed!\n");
fclose(file);
return NULL;
}
size_t read_bytes = fread(buffer, 1, *out_size, file);
fclose(file);
if (read_bytes != *out_size) {
printf("Error: File read mismatch (%zu != %zu)\n", read_bytes, *out_size);
free(buffer);
return NULL;
}
buffer[*out_size] = '\0'; //
return buffer;
}

int main() {
//
size_t vocab_len, merge_len;
char* vocab = read_file("./tokenizer_files/vocab.json", &vocab_len);
char* merges = read_file("./tokenizer_files/merges.json", &merge_len);

if (!vocab || !merges) {
    printf("Error: Failed to load vocab or merges file.\n");
    return 1;
}
printf("vocab.json & merges.json successfully loaded! (Vocab: %zu bytes, Merges: %zu bytes)\n", vocab_len, merge_len);

// 
TokenizerHandle tokenizer = byte_level_bpe_tokenizers_new_from_str(vocab, vocab_len, merges, merge_len, NULL, 0);
free(vocab);
free(merges);

if (!tokenizer) {
    printf("Tokenizer creation failed!\n");
    return 1;
}
printf("Tokenizer successfully created!\n");

// 
const char* test_sentence = "Hello, this is a BPE tokenizer test!";
TokenizerEncodeResult result;
tokenizers_encode(tokenizer, test_sentence, strlen(test_sentence), 1, &result);

if (!result.token_ids || result.len == 0) {
    printf("❌ Tokenization failed!\n");
    tokenizers_free(tokenizer);
    return 1;
}

// 
printf("Tokenized: ");
for (size_t i = 0; i < result.len; i++) {
    printf("%d ", result.token_ids[i]);
}
printf("\n");

// 
tokenizers_free_encode_results(&result, 1);
tokenizers_free(tokenizer);
printf(" Tokenizer cleanup completed.\n");

return 0;

}`

However, the following error occurs:
thread '<unnamed>' panicked at src/lib.rs:38:75: calledResult::unwrap()on anErrvalue: Error("EOF while parsing a value", line: 1, column: 0) note: run withRUST_BACKTRACE=1` environment variable to display a backtrace
thread '' panicked at core/src/panicking.rs:221:5:
panic in a function that cannot unwind
stack backtrace:
0: 0x7fae7cd9f1fa - std::backtrace_rs::backtrace::libunwind::trace::h5a5b8284f2d0c266
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/std/src/../../backtrace/src/backtrace/libunwind.rs:116:5
1: 0x7fae7cd9f1fa - std::backtrace_rs::backtrace::trace_unsynchronized::h76d4f1c9b0b875e3
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/std/src/../../backtrace/src/backtrace/mod.rs:66:5
2: 0x7fae7cd9f1fa - std::sys::backtrace::_print_fmt::hc4546b8364a537c6
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/std/src/sys/backtrace.rs:66:9
3: 0x7fae7cd9f1fa - <std::sys::backtrace::BacktraceLock::print::DisplayBacktrace as core::fmt::Display>::fmt::h5b6bd5631a6d1f6b
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/std/src/sys/backtrace.rs:39:26
4: 0x7fae7cded593 - core::fmt::rt::Argument::fmt::h270f6602a2b96f62
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/core/src/fmt/rt.rs:177:76
5: 0x7fae7cded593 - core::fmt::write::h7550c97b06c86515
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/core/src/fmt/mod.rs:1186:21
6: 0x7fae7cd935c3 - std::io::Write::write_fmt::h7b09c64fe0be9c84
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/std/src/io/mod.rs:1839:15
7: 0x7fae7cd9f042 - std::sys::backtrace::BacktraceLock::print::h2395ccd2c84ba3aa
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/std/src/sys/backtrace.rs:42:9
8: 0x7fae7cda164a - std::panicking::default_hook::{{closure}}::he19d4c7230e07961
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/std/src/panicking.rs:268:22
9: 0x7fae7cda1490 - std::panicking::default_hook::hf614597d3c67bbdb
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/std/src/panicking.rs:295:9
10: 0x7fae7cda1c87 - std::panicking::rust_panic_with_hook::h8942133a8b252070
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/std/src/panicking.rs:801:13
11: 0x7fae7cda1ae6 - std::panicking::begin_panic_handler::{{closure}}::hb5f5963570096b29
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/std/src/panicking.rs:667:13
12: 0x7fae7cd9f6d9 - std::sys::backtrace::__rust_end_short_backtrace::h6208cedc1922feda
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/std/src/sys/backtrace.rs:170:18
13: 0x7fae7cda17ac - rust_begin_unwind
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/std/src/panicking.rs:665:5
14: 0x7fae7caa82dd - core::panicking::panic_nounwind_fmt::runtime::h1f507a806003dfb2
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/core/src/panicking.rs:112:18
15: 0x7fae7caa82dd - core::panicking::panic_nounwind_fmt::h357fc035dc231634
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/core/src/panicking.rs:122:5
16: 0x7fae7caa8372 - core::panicking::panic_nounwind::hd0dad372654c389a
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/core/src/panicking.rs:221:5
17: 0x7fae7caa8536 - core::panicking::panic_cannot_unwind::h65aefd062253eb19
at /rustc/90b35a6239c3d8bdabc530a6a0816f7ff89a0aaf/library/core/src/panicking.rs:310:5
18: 0x7fae7cab1cbb - byte_level_bpe_tokenizers_new_from_str
19: 0x55d8368165ad - main
20: 0x7fae7c63e083 - __libc_start_main
at /build/glibc-FcRMwW/glibc-2.31/csu/../csu/libc-start.c:308:16
21: 0x55d8368162ae - _start
22: 0x0 -
thread caused non-unwinding panic. aborting.

`
I would like to know more about examples of using C language headers, or to get advice on errors. Thank you.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant