nix-dotfiles/pkgs/edit/src/icu.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Bindings to the ICU library.

use std::cmp::Ordering;
use std::ffi::CStr;
use std::mem;
use std::mem::MaybeUninit;
use std::ops::Range;
use std::ptr::{null, null_mut};

use crate::arena::{Arena, ArenaString, scratch_arena};
use crate::buffer::TextBuffer;
use crate::unicode::Utf8Chars;
use crate::{apperr, arena_format, sys};

static mut ENCODINGS: Vec<&'static str> = Vec::new();

/// Returns the names of all encodings that ICU can convert.
///
/// The list is built on the first call and reused afterwards. If ICU cannot
/// be loaded (or reports no converters) the list contains just `"UTF-8"`.
pub fn get_available_encodings() -> &'static [&'static str] {
    // A lazily filled `static mut` is used here in place of a `OnceCell`.
    #[allow(static_mut_refs)]
    unsafe {
        if ENCODINGS.is_empty() {
            if let Ok(icu) = init_if_needed() {
                // ICU signals the end of the converter list with a null name.
                for index in 0.. {
                    let name_ptr = (icu.ucnv_getAvailableName)(index);
                    if name_ptr.is_null() {
                        break;
                    }
                    let name = CStr::from_ptr(name_ptr).to_str().unwrap_unchecked();
                    ENCODINGS.push(name);
                }
            }

            // Always offer at least one encoding.
            if ENCODINGS.is_empty() {
                ENCODINGS.push("UTF-8");
            }
        }
        ENCODINGS.as_slice()
    }
}

/// Writes a human-readable description of the given ICU error code to `f`.
///
/// The symbolic name reported by ICU's `u_errorName` is used when available.
/// Otherwise the raw code is printed in hexadecimal.
pub fn apperr_format(f: &mut std::fmt::Formatter<'_>, code: u32) -> std::fmt::Result {
    /// Asks ICU for the name of `code`.
    /// Yields `None` if ICU is unavailable or has no usable (non-empty, UTF-8) name.
    fn error_name(code: u32) -> Option<&'static str> {
        let icu = init_if_needed().ok()?;
        let name_ptr = unsafe { (icu.u_errorName)(icu_ffi::UErrorCode::new(code)) };
        if name_ptr.is_null() {
            return None;
        }

        let name = unsafe { CStr::from_ptr(name_ptr) }.to_str().ok()?;
        (!name.is_empty()).then_some(name)
    }

    match error_name(code) {
        Some(msg) => write!(f, "ICU Error: {msg}"),
        None => write!(f, "ICU Error: {code:#08x}"),
    }
}

/// Converts between two encodings using ICU.
///
/// The conversion is done by `ucnv_convertEx`, which goes through an
/// intermediate UTF-16 "pivot" buffer borrowed for the lifetime `'pivot`.
pub struct Converter<'pivot> {
    /// ICU converter handle for the source encoding (opened via `ucnv_open`).
    source: *mut icu_ffi::UConverter,
    /// ICU converter handle for the target encoding (opened via `ucnv_open`).
    target: *mut icu_ffi::UConverter,
    /// Caller-provided scratch space handed to `ucnv_convertEx` as the pivot buffer.
    pivot_buffer: &'pivot mut [MaybeUninit<u16>],
    /// Pivot read cursor, passed to and updated by `ucnv_convertEx`.
    pivot_source: *mut u16,
    /// Pivot write cursor, passed to and updated by `ucnv_convertEx`.
    pivot_target: *mut u16,
    /// Value of the `reset` argument for the next `ucnv_convertEx` call.
    /// `true` only until the first call to `convert` has been made.
    reset: bool,
}

impl Drop for Converter<'_> {
    /// Closes both ICU converter handles owned by this instance.
    fn drop(&mut self) {
        let icu = assume_loaded();
        for converter in [self.source, self.target] {
            unsafe { (icu.ucnv_close)(converter) };
        }
    }
}

impl<'pivot> Converter<'pivot> {
    /// Creates a converter that translates `source_encoding` into `target_encoding`.
    ///
    /// # Parameters
    ///
    /// * `pivot_buffer`: Scratch space ICU uses to hold partially converted text.
    ///   Don't make it too small.
    /// * `source_encoding`: Name of the encoding to convert from (e.g., "UTF-8").
    /// * `target_encoding`: Name of the encoding to convert to (e.g., "UTF-16").
    ///
    /// # Errors
    ///
    /// Fails if ICU cannot be loaded or if either converter cannot be opened.
    pub fn new(
        pivot_buffer: &'pivot mut [MaybeUninit<u16>],
        source_encoding: &str,
        target_encoding: &str,
    ) -> apperr::Result<Self> {
        let icu = init_if_needed()?;

        // ICU expects NUL-terminated names, so build terminated copies in scratch memory.
        let scratch = scratch_arena(None);
        let source_name = Self::append_nul(&scratch, source_encoding);
        let target_name = Self::append_nul(&scratch, target_encoding);

        let mut status = icu_ffi::U_ZERO_ERROR;
        let (source, target) = unsafe {
            let source = (icu.ucnv_open)(source_name.as_ptr(), &mut status);
            let target = (icu.ucnv_open)(target_name.as_ptr(), &mut status);
            (source, target)
        };

        if status.is_failure() {
            // Don't leak whichever converter did get opened.
            for converter in [source, target] {
                if !converter.is_null() {
                    unsafe { (icu.ucnv_close)(converter) };
                }
            }
            return Err(status.as_error());
        }

        // Both pivot cursors start out at the end of the pivot buffer.
        let pivot_len = pivot_buffer.len();
        let pivot_source = pivot_buffer.as_mut_ptr() as *mut u16;
        let pivot_target = unsafe { pivot_source.add(pivot_len) };

        Ok(Self { source, target, pivot_buffer, pivot_source, pivot_target, reset: true })
    }

    /// Returns a copy of `input` with a trailing NUL byte, allocated in `arena`.
    fn append_nul<'a>(arena: &'a Arena, input: &str) -> ArenaString<'a> {
        arena_format!(arena, "{}\0", input)
    }

    /// Runs a single conversion step via `ucnv_convertEx`.
    ///
    /// # Parameters
    ///
    /// * `input`: Bytes in the `source_encoding` given to [`Converter::new`].
    ///   Passing an empty slice requests a flush.
    /// * `output`: Destination for bytes in the `target_encoding` given to
    ///   [`Converter::new`].
    ///
    /// # Returns
    ///
    /// A tuple containing:
    /// 1. The number of bytes consumed from `input`.
    /// 2. The number of bytes written to `output`.
    ///
    /// # Errors
    ///
    /// Any ICU failure other than `U_BUFFER_OVERFLOW_ERROR` is returned as an error.
    /// A buffer overflow is not an error; the returned counts tell the caller
    /// how far the conversion got.
    pub fn convert(
        &mut self,
        input: &[u8],
        output: &mut [MaybeUninit<u8>],
    ) -> apperr::Result<(usize, usize)> {
        let icu = assume_loaded();

        let src_start = input.as_ptr();
        let src_limit = unsafe { src_start.add(input.len()) };
        let mut src_cursor = src_start;

        let dst_start = output.as_mut_ptr() as *mut u8;
        let dst_limit = unsafe { dst_start.add(output.len()) };
        let mut dst_cursor = dst_start;

        let pivot_start = self.pivot_buffer.as_mut_ptr() as *mut u16;
        let pivot_limit = unsafe { pivot_start.add(self.pivot_buffer.len()) };

        let mut status = icu_ffi::U_ZERO_ERROR;

        unsafe {
            (icu.ucnv_convertEx)(
                /* target_cnv   */ self.target,
                /* source_cnv   */ self.source,
                /* target       */ &mut dst_cursor,
                /* target_limit */ dst_limit,
                /* source       */ &mut src_cursor,
                /* source_limit */ src_limit,
                /* pivot_start  */ pivot_start,
                /* pivot_source */ &mut self.pivot_source,
                /* pivot_target */ &mut self.pivot_target,
                /* pivot_limit  */ pivot_limit,
                /* reset        */ self.reset,
                /* flush        */ input.is_empty(),
                /* status       */ &mut status,
            );
        }

        // Only the very first call resets the converters.
        self.reset = false;

        let fatal = status.is_failure() && status != icu_ffi::U_BUFFER_OVERFLOW_ERROR;
        if fatal {
            return Err(status.as_error());
        }

        let consumed = unsafe { src_cursor.offset_from(src_start) as usize };
        let produced = unsafe { dst_cursor.offset_from(dst_start) as usize };
        Ok((consumed, produced))
    }
}

// Number of entries in each of the fixed-size arrays inside `Cache`.
//
// In benchmarking, I found that the performance does not really change much by changing this value.
// I picked 64 because it seemed like a reasonable lower bound.
const CACHE_SIZE: usize = 64;

/// Caches a chunk of TextBuffer contents (UTF-8) in UTF-16 format.
struct Cache {
    /// The translated text. Contains [`Cache::utf16_len`]-many valid items.
    utf16: [u16; CACHE_SIZE],
    /// For each code unit in [`Cache::utf16`] this stores the offset in the [`TextBuffer`],
    /// relative to the start of [`Cache::utf8_range`].
    /// Contains [`Cache::utf16_len`]-many valid items, plus one past-the-end entry.
    utf16_to_utf8_offsets: [u16; CACHE_SIZE],
    /// `utf8_to_utf16_offsets[native_offset - utf8_range.start]` will tell you which code unit in
    /// [`Cache::utf16`] maps to the given `native_offset` in the underlying [`TextBuffer`].
    /// Contains `utf8_range.len()`-many valid items, plus one past-the-end entry.
    utf8_to_utf16_offsets: [u16; CACHE_SIZE],

    /// The number of valid items in [`Cache::utf16`].
    utf16_len: usize,
    /// Length of the leading run of ASCII characters in this chunk, i.e. the
    /// offset of the first non-ASCII character.
    /// Less than or equal to [`Cache::utf16_len`].
    native_indexing_limit: usize,

    /// The range of UTF-8 text in the [`TextBuffer`] that this chunk covers.
    utf8_range: Range<usize>,
}

/// A pair of [`Cache`] chunks stored in the extra memory of a `UText`.
///
/// When a requested index is in neither chunk, the chunk that is *not*
/// the most recently used one gets overwritten.
struct DoubleCache {
    /// The two cached chunks.
    cache: [Cache; 2],
    /// You can consider this a 1 bit index into `cache`.
    /// It selects the most recently used chunk.
    mru: bool,
}

/// A wrapper around ICU's `UText` struct.
///
/// In our case its only purpose is to adapt a [`TextBuffer`] for ICU.
/// The wrapped `UText` is closed via `utext_close` when this value is dropped.
///
/// # Safety
///
/// Warning! No lifetime tracking is done here.
/// I initially did it properly with a PhantomData marker for the TextBuffer
/// lifetime, but it was a pain so now I don't. Not a big deal in our case.
/// It is the caller's job to keep the [`TextBuffer`] alive for as long as
/// this value exists (see [`Text::new`]).
pub struct Text(&'static mut icu_ffi::UText);

impl Drop for Text {
    /// Hands the wrapped `UText` back to ICU.
    fn drop(&mut self) {
        let icu = assume_loaded();
        unsafe {
            (icu.utext_close)(self.0);
        }
    }
}

impl Text {
    /// Wraps a [`TextBuffer`] in a freshly allocated ICU `UText`.
    ///
    /// # Safety
    ///
    /// The caller must ensure that the given [`TextBuffer`]
    /// outlives the returned `Text` instance.
    ///
    /// # Errors
    ///
    /// Fails if ICU cannot be loaded or if `utext_setup` reports a failure.
    pub unsafe fn new(tb: &TextBuffer) -> apperr::Result<Self> {
        // The provider callbacks ICU uses to read from the `TextBuffer`.
        const FUNCS: icu_ffi::UTextFuncs = icu_ffi::UTextFuncs {
            table_size: size_of::<icu_ffi::UTextFuncs>() as i32,
            reserved1: 0,
            reserved2: 0,
            reserved3: 0,
            clone: Some(utext_clone),
            native_length: Some(utext_native_length),
            access: Some(utext_access),
            extract: None,
            replace: None,
            copy: None,
            map_offset_to_native: Some(utext_map_offset_to_native),
            map_native_index_to_utf16: Some(utext_map_native_index_to_utf16),
            close: None,
            spare1: None,
            spare2: None,
            spare3: None,
        };

        let icu = init_if_needed()?;

        // Request extra space behind the `UText` for our `DoubleCache`.
        let extra_size = size_of::<DoubleCache>() as i32;
        let mut status = icu_ffi::U_ZERO_ERROR;
        let raw = unsafe { (icu.utext_setup)(null_mut(), extra_size, &mut status) };
        if status.is_failure() {
            return Err(status.as_error());
        }

        let utext = unsafe { &mut *raw };
        utext.p_funcs = &FUNCS;
        utext.context = tb as *const TextBuffer as *mut _;
        // Remember the buffer generation so later accesses can detect edits.
        utext.a = tb.generation() as i64;

        // ICU unfortunately expects a `UText` instance to have valid contents after construction.
        utext_access(utext, 0, true);

        Ok(Self(utext))
    }
}

/// Recovers the [`TextBuffer`] reference stored in the `context` field of `ut`.
fn text_buffer_from_utext<'a>(ut: &icu_ffi::UText) -> &'a TextBuffer {
    let tb = ut.context as *const TextBuffer;
    unsafe { &*tb }
}

/// Recovers the [`DoubleCache`] stored in the extra memory (`p_extra`) of `ut`.
fn double_cache_from_utext<'a>(ut: &icu_ffi::UText) -> &'a mut DoubleCache {
    let caches = ut.p_extra as *mut DoubleCache;
    unsafe { &mut *caches }
}

/// `UTextFuncs::clone` callback: creates a shallow copy of `src`.
///
/// Deep clones are rejected with `U_UNSUPPORTED_ERROR`. On any failure
/// a null pointer is returned and `status` carries the error.
extern "C" fn utext_clone(
    dest: *mut icu_ffi::UText,
    src: &icu_ffi::UText,
    deep: bool,
    status: &mut icu_ffi::UErrorCode,
) -> *mut icu_ffi::UText {
    if status.is_failure() {
        return null_mut();
    }
    if deep {
        *status = icu_ffi::U_UNSUPPORTED_ERROR;
        return null_mut();
    }

    let icu = assume_loaded();
    let extra_size = size_of::<DoubleCache>() as i32;
    let clone_ptr = unsafe { (icu.utext_setup)(dest, extra_size, status) };
    if status.is_failure() {
        return null_mut();
    }

    unsafe {
        let clone = &mut *clone_ptr;
        let src_caches = double_cache_from_utext(src);
        let dst_caches = double_cache_from_utext(clone);
        let src_active = &src_caches.cache[src_caches.mru as usize];
        let dst_active = &mut dst_caches.cache[dst_caches.mru as usize];

        // Provider state.
        clone.p_funcs = src.p_funcs;
        clone.context = src.context;
        clone.a = src.a;
        clone.provider_properties = src.provider_properties;

        // Chunk state; the contents pointer must refer to the clone's own cache.
        clone.chunk_contents = dst_active.utf16.as_ptr();
        clone.chunk_length = src.chunk_length;
        clone.chunk_offset = src.chunk_offset;
        clone.chunk_native_start = src.chunk_native_start;
        clone.chunk_native_limit = src.chunk_native_limit;
        clone.native_indexing_limit = src.native_indexing_limit;

        // Only the source's most recently used chunk is carried over.
        std::ptr::copy_nonoverlapping(src_active, dst_active, 1);
    }

    clone_ptr
}

/// `UTextFuncs::native_length` callback: reports the length of the underlying [`TextBuffer`].
extern "C" fn utext_native_length(ut: &mut icu_ffi::UText) -> i64 {
    text_buffer_from_utext(ut).text_length() as i64
}

/// `UTextFuncs::access` callback: makes the chunk around `native_index` current.
///
/// Returns `false` (leaving `ut` untouched) if no chunk can be provided
/// for the requested index and direction.
extern "C" fn utext_access(ut: &mut icu_ffi::UText, native_index: i64, forward: bool) -> bool {
    let Some(cache) = utext_access_impl(ut, native_index, forward) else {
        return false;
    };

    let chunk_start = cache.utf8_range.start;
    let chunk_end = cache.utf8_range.end;
    let relative_index = native_index as usize - chunk_start;

    // Publish the chunk to ICU.
    ut.chunk_contents = cache.utf16.as_ptr();
    ut.chunk_length = cache.utf16_len as i32;
    ut.chunk_offset = cache.utf8_to_utf16_offsets[relative_index] as i32;
    ut.chunk_native_start = chunk_start as i64;
    ut.chunk_native_limit = chunk_end as i64;
    ut.native_indexing_limit = cache.native_indexing_limit as i32;
    true
}

/// Finds or builds the [`Cache`] chunk that covers `native_index`.
///
/// With `forward == true` the chunk must contain the byte at `native_index`;
/// with `forward == false` it must contain the byte right before it.
/// Returns `None` if that byte lies outside of the text buffer.
///
/// If the text buffer generation is unchanged and one of the two cached chunks
/// already covers the byte, that chunk is returned and marked as most recently
/// used. Otherwise the other (least recently used) chunk is refilled by
/// transcoding UTF-8 from the text buffer into UTF-16.
fn utext_access_impl<'a>(
    ut: &mut icu_ffi::UText,
    native_index: i64,
    forward: bool,
) -> Option<&'a mut Cache> {
    let tb = text_buffer_from_utext(ut);
    // The index of the byte that must end up inside the returned chunk.
    let mut index_contained = native_index;

    if !forward {
        index_contained -= 1;
    }
    if index_contained < 0 || index_contained as usize >= tb.text_length() {
        return None;
    }

    let index_contained = index_contained as usize;
    let native_index = native_index as usize;
    let double_cache = double_cache_from_utext(ut);
    // `ut.a` holds the text buffer generation the caches were built for.
    let dirty = ut.a != tb.generation() as i64;

    if dirty {
        // The text buffer contents have changed.
        // Invalidate both caches so that future calls don't mistakenly use them
        // when they enter the for loop in the else branch below (`dirty == false`).
        double_cache.cache[0].utf16_len = 0;
        double_cache.cache[1].utf16_len = 0;
        double_cache.cache[0].utf8_range = 0..0;
        double_cache.cache[1].utf8_range = 0..0;
        ut.a = tb.generation() as i64;
    } else {
        // Check if one of the caches already contains the requested range.
        for (i, cache) in double_cache.cache.iter_mut().enumerate() {
            if cache.utf8_range.contains(&index_contained) {
                double_cache.mru = i != 0;
                return Some(cache);
            }
        }
    }

    // Turn the least recently used cache into the most recently used one.
    let double_cache = double_cache_from_utext(ut);
    double_cache.mru = !double_cache.mru;
    let cache = &mut double_cache.cache[double_cache.mru as usize];

    // In order to safely fit any UTF-8 character into our cache,
    // we must assume the worst case of a 4-byte long encoding.
    const UTF16_LEN_LIMIT: usize = CACHE_SIZE - 4;
    // Soft limit on the number of UTF-8 bytes to translate.
    let utf8_len_limit;
    // Absolute offset in the text buffer at which the chunk starts.
    let native_start;

    if forward {
        utf8_len_limit = (tb.text_length() - native_index).min(UTF16_LEN_LIMIT);
        native_start = native_index;
    } else {
        // The worst case ratio for UTF-8 to UTF-16 is 1:1, when the text is ASCII.
        // This allows us to safely subtract the UTF-16 buffer size
        // and assume that whatever we read as UTF-8 will fit.
        // TODO: Test what happens if you have lots of invalid UTF-8 text blow up to U+FFFD.
        utf8_len_limit = native_index.min(UTF16_LEN_LIMIT);

        // Since simply subtracting an offset may end up in the middle of a codepoint sequence,
        // we must align the offset to the next codepoint boundary.
        // Here we skip trail bytes until we find a lead.
        let mut beg = native_index - utf8_len_limit;
        let chunk = tb.read_forward(beg);
        for &c in chunk {
            if c & 0b1100_0000 != 0b1000_0000 {
                break;
            }
            beg += 1;
        }

        native_start = beg;
    }

    // Translate the given range from UTF-8 to UTF-16.
    // NOTE: This code makes the assumption that the `native_index` is always
    // at UTF-8 codepoint boundaries which technically isn't guaranteed.
    let mut utf16_len = 0;
    let mut utf8_len = 0;
    // Length of the leading ASCII-only run. While it equals `utf16_len`,
    // nothing but ASCII has been translated so far.
    let mut ascii_len = 0;
    'outer: loop {
        // `it.offset()` below is relative to `chunk`, so remember where `chunk` starts.
        let initial_utf8_len = utf8_len;
        let chunk = tb.read_forward(native_start + utf8_len);
        if chunk.is_empty() {
            break;
        }

        let mut it = Utf8Chars::new(chunk, 0);

        // If we've only seen ASCII so far we can fast-pass the UTF-16 translation,
        // because we can just widen from u8 -> u16.
        if utf16_len == ascii_len {
            let haystack = &chunk[..chunk.len().min(utf8_len_limit - ascii_len)];

            // When it comes to performance, and the search space is small (which it is here),
            // it's always a good idea to keep the loops small and tight...
            let len = haystack.iter().position(|&c| c >= 0x80).unwrap_or(haystack.len());

            // ...In this case it allows the compiler to vectorize this loop and double
            // the performance. Luckily, llvm doesn't unroll the loop, which is great,
            // because `len` will always be a relatively small number.
            for &c in &chunk[..len] {
                unsafe {
                    *cache.utf16.get_unchecked_mut(ascii_len) = c as u16;
                    *cache.utf16_to_utf8_offsets.get_unchecked_mut(ascii_len) = ascii_len as u16;
                    *cache.utf8_to_utf16_offsets.get_unchecked_mut(ascii_len) = ascii_len as u16;
                }
                ascii_len += 1;
            }

            utf16_len += len;
            utf8_len += len;
            it.seek(len);
            if ascii_len >= UTF16_LEN_LIMIT {
                break;
            }
        }

        // Slow path: decode one codepoint at a time.
        loop {
            let Some(c) = it.next() else {
                break;
            };

            // Thanks to our `if utf16_len >= UTF16_LEN_LIMIT` check,
            // we can safely assume that this will fit.
            unsafe {
                let utf8_len_beg = utf8_len;
                let utf8_len_end = initial_utf8_len + it.offset();

                // Every UTF-8 byte of this codepoint maps to its first UTF-16 code unit.
                while utf8_len < utf8_len_end {
                    *cache.utf8_to_utf16_offsets.get_unchecked_mut(utf8_len) = utf16_len as u16;
                    utf8_len += 1;
                }

                if c <= '\u{FFFF}' {
                    *cache.utf16.get_unchecked_mut(utf16_len) = c as u16;
                    *cache.utf16_to_utf8_offsets.get_unchecked_mut(utf16_len) = utf8_len_beg as u16;
                    utf16_len += 1;
                } else {
                    // Encode as a surrogate pair. Both halves map back
                    // to the start of the UTF-8 sequence.
                    let c = c as u32 - 0x10000;
                    let b = utf8_len_beg as u16;
                    *cache.utf16.get_unchecked_mut(utf16_len) = (c >> 10) as u16 | 0xD800;
                    *cache.utf16.get_unchecked_mut(utf16_len + 1) = (c & 0x3FF) as u16 | 0xDC00;
                    *cache.utf16_to_utf8_offsets.get_unchecked_mut(utf16_len) = b;
                    *cache.utf16_to_utf8_offsets.get_unchecked_mut(utf16_len + 1) = b;
                    utf16_len += 2;
                }
            }

            if utf16_len >= UTF16_LEN_LIMIT || utf8_len >= utf8_len_limit {
                break 'outer;
            }
        }
    }

    // Allow for looking up past-the-end indices via
    // `utext_map_offset_to_native` and `utext_map_native_index_to_utf16`.
    cache.utf16_to_utf8_offsets[utf16_len] = utf8_len as u16;
    cache.utf8_to_utf16_offsets[utf8_len] = utf16_len as u16;

    let native_limit = native_start + utf8_len;
    cache.utf16_len = utf16_len;
    // If parts of the UTF-8 chunk are ASCII, we can tell ICU that it doesn't need to call
    // utext_map_offset_to_native. For some reason, uregex calls that function *a lot*,
    // literally half the CPU time is spent on it.
    cache.native_indexing_limit = ascii_len;
    cache.utf8_range = native_start..native_limit;
    Some(cache)
}

/// `UTextFuncs::map_offset_to_native` callback: translates the current UTF-16
/// `chunk_offset` into an absolute offset in the underlying [`TextBuffer`].
extern "C" fn utext_map_offset_to_native(ut: &icu_ffi::UText) -> i64 {
    debug_assert!((0..=ut.chunk_length).contains(&ut.chunk_offset));

    let caches = double_cache_from_utext(ut);
    let active = &caches.cache[caches.mru as usize];
    let relative = active.utf16_to_utf8_offsets[ut.chunk_offset as usize] as usize;
    (active.utf8_range.start + relative) as i64
}

/// `UTextFuncs::map_native_index_to_utf16` callback: translates an absolute
/// [`TextBuffer`] offset inside the current chunk into a UTF-16 chunk offset.
extern "C" fn utext_map_native_index_to_utf16(ut: &icu_ffi::UText, native_index: i64) -> i32 {
    debug_assert!((ut.chunk_native_start..=ut.chunk_native_limit).contains(&native_index));

    let caches = double_cache_from_utext(ut);
    let active = &caches.cache[caches.mru as usize];
    let relative = (native_index - ut.chunk_native_start) as usize;
    active.utf8_to_utf16_offsets[relative] as i32
}

/// A wrapper around ICU's `URegularExpression` struct.
///
/// The wrapped regex is closed via `uregex_close` when this value is dropped.
///
/// # Safety
///
/// Warning! No lifetime tracking is done here.
/// The [`Text`] the regex was given must outlive it (see [`Regex::new`]).
pub struct Regex(&'static mut icu_ffi::URegularExpression);

impl Drop for Regex {
    /// Hands the wrapped regex back to ICU.
    fn drop(&mut self) {
        let icu = assume_loaded();
        unsafe {
            (icu.uregex_close)(self.0);
        }
    }
}

impl Regex {
    /// Enable case-insensitive matching.
    pub const CASE_INSENSITIVE: i32 = icu_ffi::UREGEX_CASE_INSENSITIVE;

    /// If set, ^ and $ match the start and end of each line.
    /// Otherwise, they match the start and end of the entire string.
    ///
    /// Note that [`Regex::new`] currently always enables this flag.
    pub const MULTILINE: i32 = icu_ffi::UREGEX_MULTILINE;

    /// Treat the given pattern as a literal string.
    pub const LITERAL: i32 = icu_ffi::UREGEX_LITERAL;

    /// Constructs a regex, plain and simple. Read `uregex_open` docs.
    ///
    /// `flags` is combined with `UREGEX_MULTILINE` and
    /// `UREGEX_ERROR_ON_UNKNOWN_ESCAPES`, which are always set.
    ///
    /// # Safety
    ///
    /// The caller must ensure that the given `Text` outlives the returned `Regex` instance.
    ///
    /// # Errors
    ///
    /// Fails if ICU cannot be loaded, if the pattern cannot be compiled,
    /// or if the time limit or the text cannot be applied to the regex.
    pub unsafe fn new(pattern: &str, flags: i32, text: &Text) -> apperr::Result<Self> {
        let f = init_if_needed()?;
        unsafe {
            let scratch = scratch_arena(None);
            let mut utf16 = Vec::new_in(&*scratch);
            let mut status = icu_ffi::U_ZERO_ERROR;

            utf16.extend(pattern.encode_utf16());

            let ptr = (f.uregex_open)(
                utf16.as_ptr(),
                utf16.len() as i32,
                icu_ffi::UREGEX_MULTILINE | icu_ffi::UREGEX_ERROR_ON_UNKNOWN_ESCAPES | flags,
                None,
                &mut status,
            );
            // ICU describes the time unit as being dependent on CPU performance
            // and "typically [in] the order of milliseconds", but this claim seems
            // highly outdated. On my CPU from 2021, a limit of 4096 equals roughly 600ms.
            //
            // If `uregex_open` failed, `status` is already a failure and
            // the following two calls are expected to do nothing.
            (f.uregex_setTimeLimit)(ptr, 4096, &mut status);
            (f.uregex_setUText)(ptr, text.0 as *const _ as *mut _, &mut status);
            if status.is_failure() {
                // `uregex_open` may have succeeded with a later call failing.
                // In that case the regex must be closed, or else it leaks.
                if !ptr.is_null() {
                    (f.uregex_close)(ptr);
                }
                return Err(status.as_error());
            }

            Ok(Self(&mut *ptr))
        }
    }

    /// Updates the regex pattern with the given text.
    /// If the text contents have changed, you can pass the same text as you used
    /// initially and it'll trigger ICU to reload the text and invalidate its caches.
    ///
    /// Errors reported by ICU are ignored.
    ///
    /// # Safety
    ///
    /// The caller must ensure that the given `Text` outlives the `Regex` instance.
    pub unsafe fn set_text(&mut self, text: &Text) {
        let f = assume_loaded();
        let mut status = icu_ffi::U_ZERO_ERROR;
        unsafe { (f.uregex_setUText)(self.0, text.0 as *const _ as *mut _, &mut status) };
    }

    /// Sets the regex to the absolute offset in the underlying text.
    ///
    /// Errors reported by ICU are ignored.
    pub fn reset(&mut self, index: usize) {
        let f = assume_loaded();
        let mut status = icu_ffi::U_ZERO_ERROR;
        unsafe { (f.uregex_reset64)(self.0, index as i64, &mut status) };
    }
}

impl Iterator for Regex {
    /// The byte range of a match in the underlying text.
    type Item = Range<usize>;

    /// Advances to the next match and returns its range.
    ///
    /// Yields `None` if there is no further match or if ICU reports a
    /// failure while querying the match boundaries.
    fn next(&mut self) -> Option<Self::Item> {
        let icu = assume_loaded();
        let mut status = icu_ffi::U_ZERO_ERROR;

        let found = unsafe { (icu.uregex_findNext)(self.0, &mut status) };
        if !found {
            return None;
        }

        // Group 0 is the entire match.
        let raw_start = unsafe { (icu.uregex_start64)(self.0, 0, &mut status) };
        let raw_end = unsafe { (icu.uregex_end64)(self.0, 0, &mut status) };
        if status.is_failure() {
            return None;
        }

        // Clamp so that the range is never negative nor inverted.
        let match_start = raw_start.max(0);
        let match_end = raw_end.max(match_start);
        Some(match_start as usize..match_end as usize)
    }
}

static mut ROOT_COLLATOR: Option<*mut icu_ffi::UCollator> = None;

/// Orders two UTF-8 strings for sorting purposes.
///
/// Uses ICU's collator (opened once, with an empty locale name) when available
/// and falls back to [`compare_strings_ascii`] otherwise.
pub fn compare_strings(a: &[u8], b: &[u8]) -> Ordering {
    // A lazily filled `static mut` is used here in place of a `OnceCell`.
    // A null pointer records that the collator is unavailable.
    #[allow(static_mut_refs)]
    let collator = unsafe {
        match ROOT_COLLATOR {
            Some(existing) => existing,
            None => {
                let opened = match init_if_needed() {
                    Ok(icu) => {
                        let mut status = icu_ffi::U_ZERO_ERROR;
                        (icu.ucol_open)(c"".as_ptr(), &mut status)
                    }
                    Err(_) => null_mut(),
                };
                ROOT_COLLATOR = Some(opened);
                opened
            }
        }
    };

    if collator.is_null() {
        return compare_strings_ascii(a, b);
    }

    let icu = assume_loaded();
    let mut status = icu_ffi::U_ZERO_ERROR;
    let result = unsafe {
        (icu.ucol_strcollUTF8)(
            collator,
            a.as_ptr(),
            a.len() as i32,
            b.as_ptr(),
            b.len() as i32,
            &mut status,
        )
    };

    match result {
        icu_ffi::UCollationResult::UCOL_LESS => Ordering::Less,
        icu_ffi::UCollationResult::UCOL_EQUAL => Ordering::Equal,
        icu_ffi::UCollationResult::UCOL_GREATER => Ordering::Greater,
    }
}

/// Unicode collation via `ucol_strcollUTF8`, now for ASCII!
///
/// The comparison uses three levels, from most to least significant:
/// 1. The first position at which the strings differ case-insensitively.
/// 2. The length of the strings (the shorter string wins).
/// 3. The first position at which the strings differ only in case,
///    compared by raw byte value (so upper case sorts before lower case).
///
/// Only ASCII letters are case-mapped; all other bytes are compared as-is.
fn compare_strings_ascii(a: &[u8], b: &[u8]) -> Ordering {
    // Low weight: The order of the first pair of bytes which differs only in
    // case. It is used as the final fallback if nothing else differs.
    let mut case_order = Ordering::Equal;

    for (&ca, &cb) in a.iter().zip(b) {
        let la = ca.to_ascii_lowercase();
        let lb = cb.to_ascii_lowercase();

        // High weight: The first character which differs case-insensitively
        // decides the order, no matter what came before it.
        if la != lb {
            return la.cmp(&lb);
        }

        if case_order == Ordering::Equal {
            case_order = ca.cmp(&cb);
        }
    }

    // One string is a case-insensitive prefix of the other (or they're equal
    // in that regard): The shorter string wins, then the case difference.
    a.len().cmp(&b.len()).then(case_order)
}

static mut ROOT_CASEMAP: Option<*mut icu_ffi::UCaseMap> = None;

/// Case-folds the given UTF-8 string.
///
/// Case folding differs from lower case in that the output is primarily useful
/// to machines for comparisons. It's like applying Unicode normalization.
///
/// The result is allocated in `arena`. If ICU is unavailable, fails, or
/// produces an empty result, this falls back to ASCII lower-casing of `input`.
pub fn fold_case<'a>(arena: &'a Arena, input: &str) -> ArenaString<'a> {
    // OnceCell for people that want to put it into a static.
    // A null pointer records that the case map is unavailable.
    #[allow(static_mut_refs)]
    let casemap = unsafe {
        if ROOT_CASEMAP.is_none() {
            ROOT_CASEMAP = Some(if let Ok(f) = init_if_needed() {
                let mut status = icu_ffi::U_ZERO_ERROR;
                (f.ucasemap_open)(null(), 0, &mut status)
            } else {
                null_mut()
            })
        }
        ROOT_CASEMAP.unwrap_unchecked()
    };

    if !casemap.is_null() {
        let f = assume_loaded();
        let mut status = icu_ffi::U_ZERO_ERROR;
        let mut output = Vec::new_in(arena);
        let mut output_len;

        // First, guess the output length:
        // TODO: What's a good heuristic here?
        {
            output.reserve_exact(input.len() + 16);
            let output = output.spare_capacity_mut();
            output_len = unsafe {
                (f.ucasemap_utf8FoldCase)(
                    casemap,
                    output.as_mut_ptr() as *mut _,
                    output.len() as i32,
                    input.as_ptr() as *const _,
                    input.len() as i32,
                    &mut status,
                )
            };
        }

        // If that failed to fit, retry with the correct length.
        if status == icu_ffi::U_BUFFER_OVERFLOW_ERROR && output_len > 0 {
            // ICU functions return immediately if the incoming status already
            // indicates a failure, so it must be cleared before retrying.
            status = icu_ffi::U_ZERO_ERROR;

            output.reserve_exact(output_len as usize);
            let output = output.spare_capacity_mut();
            output_len = unsafe {
                (f.ucasemap_utf8FoldCase)(
                    casemap,
                    output.as_mut_ptr() as *mut _,
                    output.len() as i32,
                    input.as_ptr() as *const _,
                    input.len() as i32,
                    &mut status,
                )
            };
        }

        if status.is_success() && output_len > 0 {
            unsafe {
                output.set_len(output_len as usize);
            }
            return unsafe { ArenaString::from_utf8_unchecked(output) };
        }
    }

    // Fallback: ASCII-only lower-casing.
    let mut result = ArenaString::from_str(arena, input);
    for b in unsafe { result.as_bytes_mut() } {
        b.make_ascii_lowercase();
    }
    result
}

// Table of all ICU functions used by this module, resolved at runtime.
//
// WARNING:
// The order of the fields MUST match the order of strings in the following two arrays:
// first all entries of `LIBICUUC_PROC_NAMES`, then all entries of `LIBICUI18N_PROC_NAMES`.
// The loader treats this struct as a flat array of function pointers.
#[allow(non_snake_case)]
#[repr(C)]
struct LibraryFunctions {
    // LIBICUUC_PROC_NAMES
    u_errorName: icu_ffi::u_errorName,
    ucnv_getAvailableName: icu_ffi::ucnv_getAvailableName,
    ucnv_open: icu_ffi::ucnv_open,
    ucnv_close: icu_ffi::ucnv_close,
    ucnv_convertEx: icu_ffi::ucnv_convertEx,
    ucasemap_open: icu_ffi::ucasemap_open,
    ucasemap_utf8FoldCase: icu_ffi::ucasemap_utf8FoldCase,
    utext_setup: icu_ffi::utext_setup,
    utext_close: icu_ffi::utext_close,

    // LIBICUI18N_PROC_NAMES
    uregex_open: icu_ffi::uregex_open,
    uregex_close: icu_ffi::uregex_close,
    uregex_setTimeLimit: icu_ffi::uregex_setTimeLimit,
    uregex_setUText: icu_ffi::uregex_setUText,
    uregex_reset64: icu_ffi::uregex_reset64,
    uregex_findNext: icu_ffi::uregex_findNext,
    uregex_start64: icu_ffi::uregex_start64,
    uregex_end64: icu_ffi::uregex_end64,
    ucol_open: icu_ffi::ucol_open,
    ucol_strcollUTF8: icu_ffi::ucol_strcollUTF8,
}

// Symbol names for the first group of fields in `LibraryFunctions`, in field order.
const LIBICUUC_PROC_NAMES: [&CStr; 9] = [
    // Found in libicuuc.so on UNIX, icuuc.dll/icu.dll on Windows.
    c"u_errorName",
    c"ucnv_getAvailableName",
    c"ucnv_open",
    c"ucnv_close",
    c"ucnv_convertEx",
    c"ucasemap_open",
    c"ucasemap_utf8FoldCase",
    c"utext_setup",
    c"utext_close",
];

// Symbol names for the second group of fields in `LibraryFunctions`, in field order.
const LIBICUI18N_PROC_NAMES: [&CStr; 10] = [
    // Found in libicui18n.so on UNIX, icuin.dll/icu.dll on Windows.
    c"uregex_open",
    c"uregex_close",
    c"uregex_setTimeLimit",
    c"uregex_setUText",
    c"uregex_reset64",
    c"uregex_findNext",
    c"uregex_start64",
    c"uregex_end64",
    c"ucol_open",
    c"ucol_strcollUTF8",
];

/// Tracks whether the ICU function table has been loaded.
enum LibraryFunctionsState {
    /// No attempt to load ICU has been made yet.
    Uninitialized,
    /// Loading ICU was attempted and did not succeed.
    Failed,
    /// ICU was loaded; holds the resolved function table.
    Loaded(LibraryFunctions),
}

// Global load state of the ICU function table. Starts out as `Uninitialized`.
static mut LIBRARY_FUNCTIONS: LibraryFunctionsState = LibraryFunctionsState::Uninitialized;

/// Loads ICU if that hasn't happened yet and reports whether it is available.
pub fn init() -> apperr::Result<()> {
    init_if_needed().map(|_| ())
}

/// Returns the resolved ICU function table, loading it on first use.
///
/// Only a call that finds `LIBRARY_FUNCTIONS` in the `Uninitialized` state runs the loader.
/// The loader always leaves the state as either `Failed` or `Loaded`, so later calls merely
/// inspect the stored result.
///
/// # Errors
///
/// Returns `apperr::APP_ICU_MISSING` whenever the state is not `Loaded`, i.e. when one of the
/// two libraries could not be loaded or any of the listed symbols could not be resolved.
#[allow(static_mut_refs)]
fn init_if_needed() -> apperr::Result<&'static LibraryFunctions> {
    /// One-time loader: opens both libraries and resolves every name in `LIBICUUC_PROC_NAMES`
    /// and `LIBICUI18N_PROC_NAMES` into a `LibraryFunctions` value.
    #[cold]
    fn load() {
        unsafe {
            // Record failure up front: every early `return` below then leaves the state as
            // `Failed`, and since only `Uninitialized` triggers a load, it is never retried.
            LIBRARY_FUNCTIONS = LibraryFunctionsState::Failed;

            let Ok(libicuuc) = sys::load_libicuuc() else {
                return;
            };
            let Ok(libicui18n) = sys::load_libicui18n() else {
                return;
            };

            // Stand-in type for "some function pointer"; used to view `LibraryFunctions` as an
            // array of equally sized slots.
            type TransparentFunction = unsafe extern "C" fn() -> *const ();

            // OH NO I'M DOING A BAD THING
            //
            // If this assertion hits, you either forgot to update `LIBICUUC_PROC_NAMES` or
            // `LIBICUI18N_PROC_NAMES`, or you're on a platform where `dlsym` behaves differently
            // from classic UNIX and Windows.
            //
            // This code assumes that we can treat the `LibraryFunctions` struct containing various different function
            // pointers as an array of `TransparentFunction` pointers. In C, this works on any platform that supports
            // POSIX `dlsym` or equivalent, but I suspect Rust is once again being extra about it. In any case, that's
            // still better than loading every function one by one, just to blow up our binary size for no reason.
            const _: () = assert!(
                mem::size_of::<LibraryFunctions>()
                    == mem::size_of::<TransparentFunction>()
                        * (LIBICUUC_PROC_NAMES.len() + LIBICUI18N_PROC_NAMES.len())
            );

            // `ptr` walks over `funcs` one function-pointer slot at a time.
            let mut funcs = MaybeUninit::<LibraryFunctions>::uninit();
            let mut ptr = funcs.as_mut_ptr() as *mut TransparentFunction;

            // On UNIX every name gets a suffix appended before lookup. The suffix is computed
            // once from the `libicuuc` handle and reused for both libraries.
            // NOTE(review): presumably this is ICU's versioned symbol naming (e.g. `u_errorName_74`)
            // — confirm in `sys::icu_proc_suffix`.
            #[cfg(unix)]
            let scratch_outer = scratch_arena(None);
            #[cfg(unix)]
            let suffix = sys::icu_proc_suffix(&scratch_outer, libicuuc);

            // The iteration order (libicuuc names first, then libicui18n names) is what ties
            // each slot to the field of the same name in `LibraryFunctions`.
            for (handle, names) in
                [(libicuuc, &LIBICUUC_PROC_NAMES[..]), (libicui18n, &LIBICUI18N_PROC_NAMES[..])]
            {
                for name in names {
                    // The suffixed name is built in a scratch arena created per iteration and
                    // shadows the plain `name` for the lookup below.
                    #[cfg(unix)]
                    let scratch = scratch_arena(Some(&scratch_outer));
                    #[cfg(unix)]
                    let name = &sys::add_icu_proc_suffix(&scratch, name, &suffix);

                    let Ok(func) = sys::get_proc_address(handle, name) else {
                        // Loud in debug builds; release builds just bail out with the state
                        // still set to `Failed`.
                        debug_assert!(
                            false,
                            "Failed to load ICU function: {}",
                            name.to_string_lossy()
                        );
                        return;
                    };

                    // Store the resolved address in the current slot and advance to the next.
                    ptr.write(func);
                    ptr = ptr.add(1);
                }
            }

            // Reaching this point means a slot was written for every name in both arrays; the
            // const assertion above guarantees that this covers the whole struct.
            LIBRARY_FUNCTIONS = LibraryFunctionsState::Loaded(funcs.assume_init());
        }
    }

    unsafe {
        if matches!(&LIBRARY_FUNCTIONS, LibraryFunctionsState::Uninitialized) {
            load();
        }
    }

    // Both `Failed` and (unexpectedly) `Uninitialized` map to the same "ICU missing" error.
    match unsafe { &LIBRARY_FUNCTIONS } {
        LibraryFunctionsState::Loaded(f) => Ok(f),
        _ => Err(apperr::APP_ICU_MISSING),
    }
}

/// Returns the ICU function table without attempting to load it.
///
/// # Panics
///
/// Hits `unreachable!()` if `LIBRARY_FUNCTIONS` is not in the `Loaded` state, so this must only
/// be called on paths where a prior `init`/`init_if_needed` call has succeeded.
#[allow(static_mut_refs)]
fn assume_loaded() -> &'static LibraryFunctions {
    let state = unsafe { &LIBRARY_FUNCTIONS };
    let LibraryFunctionsState::Loaded(funcs) = state else {
        unreachable!();
    };
    funcs
}

/// Hand-written declarations of the ICU types, constants and function signatures used by this
/// file.
///
/// The `pub type` aliases named after ICU functions are the field types of `LibraryFunctions`;
/// the functions themselves are resolved by name at runtime rather than linked. Every
/// `#[repr(C)]` struct and every signature here is part of the ABI contract with the C library
/// and must not be edited independently of the corresponding C declaration.
mod icu_ffi {
    #![allow(dead_code, non_camel_case_types)]

    use std::ffi::{c_char, c_int, c_void};

    use crate::apperr;

    /// Status code exchanged with ICU functions.
    ///
    /// `#[repr(transparent)]` over `c_int`, so it can be passed by value or by reference
    /// wherever the function signatures below use it.
    #[derive(Copy, Clone, Eq, PartialEq)]
    #[repr(transparent)]
    pub struct UErrorCode(c_int);

    impl UErrorCode {
        /// Wraps a raw code; `code` is converted to `c_int` with an `as` cast.
        pub const fn new(code: u32) -> Self {
            Self(code as c_int)
        }

        /// Returns `true` when the code is zero or negative.
        // NOTE(review): presumably mirrors ICU's `U_SUCCESS` macro — confirm.
        pub fn is_success(&self) -> bool {
            self.0 <= 0
        }

        /// Returns `true` when the code is strictly positive.
        pub fn is_failure(&self) -> bool {
            self.0 > 0
        }

        /// Converts the code into an `apperr::Error` via `apperr::Error::new_icu`.
        ///
        /// Debug builds assert that the code is a failure (strictly positive).
        pub fn as_error(&self) -> apperr::Error {
            debug_assert!(self.0 > 0);
            apperr::Error::new_icu(self.0 as u32)
        }
    }

    /// The zero code, which `is_success` accepts.
    pub const U_ZERO_ERROR: UErrorCode = UErrorCode(0);
    pub const U_BUFFER_OVERFLOW_ERROR: UErrorCode = UErrorCode(15);
    pub const U_UNSUPPORTED_ERROR: UErrorCode = UErrorCode(16);

    /// Signature of the `u_errorName` symbol: maps a status code to a C string.
    pub type u_errorName = unsafe extern "C" fn(code: UErrorCode) -> *const c_char;

    /// Opaque converter handle; only ever used behind a raw pointer in this module.
    pub struct UConverter;

    /// Signature of the `ucnv_getAvailableName` symbol.
    pub type ucnv_getAvailableName = unsafe extern "C" fn(n: i32) -> *mut c_char;

    /// Signature of the `ucnv_open` symbol.
    pub type ucnv_open =
        unsafe extern "C" fn(converter_name: *const u8, status: &mut UErrorCode) -> *mut UConverter;

    /// Signature of the `ucnv_close` symbol.
    pub type ucnv_close = unsafe extern "C" fn(converter: *mut UConverter);

    /// Signature of the `ucnv_convertEx` symbol.
    ///
    /// `target` and `source` are pointers to cursors, each paired with a `*_limit` end pointer;
    /// the `pivot_*` arguments describe a `u16` buffer in the same start/cursor/limit fashion.
    pub type ucnv_convertEx = unsafe extern "C" fn(
        target_cnv: *mut UConverter,
        source_cnv: *mut UConverter,
        target: *mut *mut u8,
        target_limit: *const u8,
        source: *mut *const u8,
        source_limit: *const u8,
        pivot_start: *mut u16,
        pivot_source: *mut *mut u16,
        pivot_target: *mut *mut u16,
        pivot_limit: *const u16,
        reset: bool,
        flush: bool,
        status: &mut UErrorCode,
    );

    /// Opaque case-map handle; only ever used behind a raw pointer in this module.
    pub struct UCaseMap;

    /// Signature of the `ucasemap_open` symbol.
    pub type ucasemap_open = unsafe extern "C" fn(
        locale: *const c_char,
        options: u32,
        status: &mut UErrorCode,
    ) -> *mut UCaseMap;

    /// Signature of the `ucasemap_utf8FoldCase` symbol.
    pub type ucasemap_utf8FoldCase = unsafe extern "C" fn(
        csm: *const UCaseMap,
        dest: *mut c_char,
        dest_capacity: i32,
        src: *const c_char,
        src_length: i32,
        status: &mut UErrorCode,
    ) -> i32;

    /// Return type of `ucol_strcollUTF8`.
    #[repr(C)]
    pub enum UCollationResult {
        UCOL_EQUAL = 0,
        UCOL_GREATER = 1,
        UCOL_LESS = -1,
    }

    /// Opaque collator handle; only ever used behind a raw pointer in this module.
    #[repr(C)]
    pub struct UCollator;

    /// Signature of the `ucol_open` symbol.
    pub type ucol_open =
        unsafe extern "C" fn(loc: *const c_char, status: &mut UErrorCode) -> *mut UCollator;

    /// Signature of the `ucol_strcollUTF8` symbol.
    pub type ucol_strcollUTF8 = unsafe extern "C" fn(
        coll: *mut UCollator,
        source: *const u8,
        source_length: i32,
        target: *const u8,
        target_length: i32,
        status: &mut UErrorCode,
    ) -> UCollationResult;

    // UText callback functions
    //
    // These are the types of the entries in `UTextFuncs` below.
    pub type UTextClone = unsafe extern "C" fn(
        dest: *mut UText,
        src: &UText,
        deep: bool,
        status: &mut UErrorCode,
    ) -> *mut UText;
    pub type UTextNativeLength = unsafe extern "C" fn(ut: &mut UText) -> i64;
    pub type UTextAccess =
        unsafe extern "C" fn(ut: &mut UText, native_index: i64, forward: bool) -> bool;
    pub type UTextExtract = unsafe extern "C" fn(
        ut: &mut UText,
        native_start: i64,
        native_limit: i64,
        dest: *mut u16,
        dest_capacity: i32,
        status: &mut UErrorCode,
    ) -> i32;
    pub type UTextReplace = unsafe extern "C" fn(
        ut: &mut UText,
        native_start: i64,
        native_limit: i64,
        replacement_text: *const u16,
        replacement_length: i32,
        status: &mut UErrorCode,
    ) -> i32;
    pub type UTextCopy = unsafe extern "C" fn(
        ut: &mut UText,
        native_start: i64,
        native_limit: i64,
        native_dest: i64,
        move_text: bool,
        status: &mut UErrorCode,
    );
    pub type UTextMapOffsetToNative = unsafe extern "C" fn(ut: &UText) -> i64;
    pub type UTextMapNativeIndexToUTF16 =
        unsafe extern "C" fn(ut: &UText, native_index: i64) -> i32;
    pub type UTextClose = unsafe extern "C" fn(ut: &mut UText);

    /// Callback table referenced by `UText::p_funcs`.
    ///
    /// Every callback slot is an `Option`, so a provider can leave individual entries unset.
    #[repr(C)]
    pub struct UTextFuncs {
        pub table_size: i32,
        pub reserved1: i32,
        pub reserved2: i32,
        pub reserved3: i32,
        pub clone: Option<UTextClone>,
        pub native_length: Option<UTextNativeLength>,
        pub access: Option<UTextAccess>,
        pub extract: Option<UTextExtract>,
        pub replace: Option<UTextReplace>,
        pub copy: Option<UTextCopy>,
        pub map_offset_to_native: Option<UTextMapOffsetToNative>,
        pub map_native_index_to_utf16: Option<UTextMapNativeIndexToUTF16>,
        pub close: Option<UTextClose>,
        pub spare1: Option<UTextClose>,
        pub spare2: Option<UTextClose>,
        pub spare3: Option<UTextClose>,
    }

    /// `#[repr(C)]` declaration of the `UText` struct that is handed to `utext_setup`,
    /// `uregex_setUText` and the callbacks above.
    ///
    /// Field order and types define the memory layout shared with the C side; do not reorder.
    #[repr(C)]
    pub struct UText {
        pub magic: u32,
        pub flags: i32,
        pub provider_properties: i32,
        pub size_of_struct: i32,
        pub chunk_native_limit: i64,
        pub extra_size: i32,
        pub native_indexing_limit: i32,
        pub chunk_native_start: i64,
        pub chunk_offset: i32,
        pub chunk_length: i32,
        pub chunk_contents: *const u16,
        pub p_funcs: &'static UTextFuncs,
        pub p_extra: *mut c_void,
        pub context: *mut c_void,
        pub p: *mut c_void,
        pub q: *mut c_void,
        pub r: *mut c_void,
        pub priv_p: *mut c_void,
        pub a: i64,
        pub b: i32,
        pub c: i32,
        pub priv_a: i64,
        pub priv_b: i32,
        pub priv_c: i32,
    }

    // NOTE(review): presumably the expected value of `UText::magic` — confirm against `utext.h`.
    pub const UTEXT_MAGIC: u32 = 0x345ad82c;
    // NOTE(review): the values below are consecutive (1..=5) rather than powers of two, so they
    // look like bit *positions* for `UText::provider_properties`, not masks — confirm against
    // ICU's `utext.h` before OR-ing them directly.
    pub const UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE: i32 = 1;
    pub const UTEXT_PROVIDER_STABLE_CHUNKS: i32 = 2;
    pub const UTEXT_PROVIDER_WRITABLE: i32 = 3;
    pub const UTEXT_PROVIDER_HAS_META_DATA: i32 = 4;
    pub const UTEXT_PROVIDER_OWNS_TEXT: i32 = 5;

    /// Signature of the `utext_setup` symbol.
    pub type utext_setup = unsafe extern "C" fn(
        ut: *mut UText,
        extra_space: i32,
        status: &mut UErrorCode,
    ) -> *mut UText;
    /// Signature of the `utext_close` symbol.
    pub type utext_close = unsafe extern "C" fn(ut: *mut UText) -> *mut UText;

    /// Out-parameter of `uregex_open` (its `pe` argument).
    #[repr(C)]
    pub struct UParseError {
        pub line: i32,
        pub offset: i32,
        pub pre_context: [u16; 16],
        pub post_context: [u16; 16],
    }

    /// Opaque regex handle; only ever used behind a raw pointer in this module.
    #[repr(C)]
    pub struct URegularExpression;

    // Distinct single-bit values.
    // NOTE(review): presumably meant to be OR-ed into the `flags` argument of `uregex_open` —
    // confirm against `uregex.h`.
    pub const UREGEX_UNIX_LINES: i32 = 1;
    pub const UREGEX_CASE_INSENSITIVE: i32 = 2;
    pub const UREGEX_COMMENTS: i32 = 4;
    pub const UREGEX_MULTILINE: i32 = 8;
    pub const UREGEX_LITERAL: i32 = 16;
    pub const UREGEX_DOTALL: i32 = 32;
    pub const UREGEX_UWORD: i32 = 256;
    pub const UREGEX_ERROR_ON_UNKNOWN_ESCAPES: i32 = 512;

    /// Signature of the `uregex_open` symbol. The pattern is passed as `u16` code units; `pe`
    /// is optional.
    pub type uregex_open = unsafe extern "C" fn(
        pattern: *const u16,
        pattern_length: i32,
        flags: i32,
        pe: Option<&mut UParseError>,
        status: &mut UErrorCode,
    ) -> *mut URegularExpression;
    /// Signature of the `uregex_close` symbol.
    pub type uregex_close = unsafe extern "C" fn(regexp: *mut URegularExpression);
    /// Signature of the `uregex_setTimeLimit` symbol.
    pub type uregex_setTimeLimit =
        unsafe extern "C" fn(regexp: *mut URegularExpression, limit: i32, status: &mut UErrorCode);
    /// Signature of the `uregex_setUText` symbol.
    pub type uregex_setUText = unsafe extern "C" fn(
        regexp: *mut URegularExpression,
        text: *mut UText,
        status: &mut UErrorCode,
    );
    /// Signature of the `uregex_reset64` symbol.
    pub type uregex_reset64 =
        unsafe extern "C" fn(regexp: *mut URegularExpression, index: i64, status: &mut UErrorCode);
    /// Signature of the `uregex_findNext` symbol.
    pub type uregex_findNext =
        unsafe extern "C" fn(regexp: *mut URegularExpression, status: &mut UErrorCode) -> bool;
    /// Signature of the `uregex_start64` symbol.
    pub type uregex_start64 = unsafe extern "C" fn(
        regexp: *mut URegularExpression,
        group_num: i32,
        status: &mut UErrorCode,
    ) -> i64;
    /// Signature of the `uregex_end64` symbol.
    pub type uregex_end64 = unsafe extern "C" fn(
        regexp: *mut URegularExpression,
        group_num: i32,
        status: &mut UErrorCode,
    ) -> i64;
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Checks a single `compare_strings_ascii` expectation.
    ///
    /// `#[track_caller]` makes a failing assertion point at the calling line in the test below
    /// instead of at this helper.
    #[track_caller]
    fn expect_order(lhs: &[u8], rhs: &[u8], expected: Ordering) {
        assert_eq!(compare_strings_ascii(lhs, rhs), expected);
    }

    #[test]
    fn test_compare_strings_ascii() {
        // Two empty inputs compare equal.
        expect_order(b"", b"", Ordering::Equal);

        // Identical inputs compare equal.
        expect_order(b"hello", b"hello", Ordering::Equal);

        // When one input is a prefix of the other, the shorter one sorts first.
        expect_order(b"abc", b"abcd", Ordering::Less);
        expect_order(b"abcd", b"abc", Ordering::Greater);

        // Same letters in different cases: the first character decides.
        expect_order(b"AbC", b"aBc", Ordering::Less);

        // Different letters in different cases: the second character decides,
        // because it is the first one that differs as a letter.
        expect_order(b"hallo", b"Hello", Ordering::Less);
        expect_order(b"Hello", b"hallo", Ordering::Greater);
    }
}