Critical bug fixes and recall improvements (#68)

This commit is contained in:
Eli Peter 2026-05-11 12:42:39 -04:00 committed by GitHub
parent 7d0e7320e2
commit 55247b7fcd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
352 changed files with 60069 additions and 900 deletions

View file

@ -1368,11 +1368,15 @@ fn truncate_prefix_lock(s: &str) -> String {
}
}
/// Returns the longest common prefix of `a` and `b` as an owned `String`.
///
/// The comparison walks both inputs one `char` (Unicode scalar value) at a
/// time, so multi-byte UTF-8 sequences are kept whole and the result is
/// always made of complete code points taken from `a`. Comparison is per
/// scalar value; no normalization or grapheme-cluster handling is applied.
///
/// The earlier byte-iteration form re-encoded continuation bytes as
/// Latin-1 chars and produced mojibake; the same fix lives at
/// `crate::abstract_interp::string_domain::longest_common_prefix`.
fn longest_common_prefix(a: &str, b: &str) -> String {
    a.chars()
        .zip(b.chars())
        // Stop at the first position where the two code points differ;
        // `zip` already stops when the shorter input is exhausted.
        .take_while(|(x, y)| x == y)
        .map(|(x, _)| x)
        .collect()
}
@ -1380,6 +1384,24 @@ fn longest_common_prefix(a: &str, b: &str) -> String {
mod tests {
use super::*;
// ── LCP helper ──────────────────────────────────────────────────────
#[test]
fn lcp_basic() {
    // Each row is (left input, right input, expected common prefix):
    // a partial match, identical inputs, and an empty left-hand side.
    let cases = [
        ("abcdef", "abcxyz", "abc"),
        ("abc", "abc", "abc"),
        ("", "abc", ""),
    ];
    for (left, right, expected) in cases {
        assert_eq!(longest_common_prefix(left, right), expected);
    }
}
#[test]
fn lcp_keeps_utf8_codepoints_whole() {
    // Without char-alignment, byte iteration would emit the
    // continuation byte 0xA9 as a separate char and corrupt the
    // prefix. Both the 2-byte and 3-byte UTF-8 cases must survive.
    assert_eq!(longest_common_prefix("héllo", "héllo!"), "héllo");
    assert_eq!(longest_common_prefix("名前.json", "名前.txt"), "名前.");
    // Distinct code points whose encodings share leading bytes
    // ("é" = C3 A9 vs "è" = C3 A8; "名" = E5 90 8D vs "同" = E5 90 8C)
    // must contribute nothing: a byte-wise prefix would stop in the
    // middle of the sequence and leak the shared bytes.
    assert_eq!(longest_common_prefix("é", "è"), "");
    assert_eq!(longest_common_prefix("名", "同"), "");
    assert_eq!(longest_common_prefix("café", "cafè"), "caf");
}
// ── Tri lattice laws ────────────────────────────────────────────────
#[test]