From ed0fd9937fa5f77494c728827226a94d4295f670 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson"
Date: Mon, 6 Nov 2023 23:32:39 +0100
Subject: [PATCH] Use diacritic-ignoring, proper UCA comparison.

Names were compared with lc(), which folds case but still treats
accented and unaccented spellings of the same name as different.
Compare Unicode::Collate level-1 sort keys instead, both in
matches_name() and as the key of the seen_names hash.

Variable weighting is set to 'non-ignorable' so that whitespace and
punctuation still distinguish names. With the module default
('shifted') those characters carry no primary weight and would be
ignored at level 1, making e.g. "Jo Hansen" and "Johansen" collide.
---
Review note: line breaks and indentation in this patch were reconstructed
from a whitespace-collapsed copy. Apply with --ignore-whitespace and check
the blank context lines against bin/sync.pl. The blob ids on the index
line are those of the original commit.

 bin/sync.pl | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/bin/sync.pl b/bin/sync.pl
index ef07ee7..4351817 100644
--- a/bin/sync.pl
+++ b/bin/sync.pl
@@ -9,6 +9,7 @@ use DBI;
 use POSIX;
 use Time::HiRes;
 use IO::Select;
+use Unicode::Collate;
 binmode STDOUT, ':utf8';
 binmode STDERR, ':utf8';
 use utf8;
@@ -16,6 +17,10 @@ use utf8;
 require '../include/config.pm';
 
 my @log = ();
+# Collator for name matching. Level 1 compares base letters only, so case and
+# diacritics are ignored; 'non-ignorable' keeps spaces and punctuation
+# significant (the default, 'shifted', would drop them at this level).
+my $uca = Unicode::Collate->new(level => 1, variable => 'non-ignorable');
 
 my %rgb = (
 	yellow => {
@@ -44,6 +49,13 @@ sub log_timing {
 	printf "%s: %.0f ms.\n", $msg, 1e3 * $elapsed;
 }
 
+# Returns the collation key for $m. Callers compare keys with eq, or use
+# them as hash keys, to decide whether two names are the same.
+sub sort_key {
+	my $m = shift;
+	return $uca->getSortKey($m);
+}
+
 sub get_oauth_bearer_token {
 	my ($dbh, $ua) = @_;
 	my $now = time();
@@ -107,18 +119,18 @@ sub get_spreadsheet_name {
 
 sub matches_name {
 	my ($slack_name, $spreadsheet_name) = @_;
-	if (lc($slack_name) eq lc($spreadsheet_name)) {
+	if (sort_key($slack_name) eq sort_key($spreadsheet_name)) {
 		return 1;
 	}
 
 	my @ap = split /\s+/, $slack_name;
 	my @bp = split /\s+/, $spreadsheet_name;
-	if (scalar @ap >= 2 && scalar @bp >= 2 && lc($ap[0]) eq lc($bp[0])) {
+	if (scalar @ap >= 2 && scalar @bp >= 2 && sort_key($ap[0]) eq sort_key($bp[0])) {
 		# First name matches, try to match some surname
 		my $found = 0;
 		for my $ai (1..$#ap) {
 			for my $bi (1..$#bp) {
-				$found = 1 if (lc($ap[$ai]) eq lc($bp[$bi]));
+				$found = 1 if (sort_key($ap[$ai]) eq sort_key($bp[$bi]));
 			}
 		}
 		if ($found) {
@@ -278,7 +290,7 @@ sub find_where_each_name_is {
 		for my $val (@{$row->{'values'}}) {
 			my $name = get_spreadsheet_name($val);
 			if (defined($name)) {
-				push @{$seen_names{lc $name}}, [$name, $rowno, $colno];
+				push @{$seen_names{sort_key($name)}}, [$name, $rowno, $colno];
 			}
 			++$colno;
 		}
@@ -397,10 +409,10 @@ sub find_diff {
 	}
 	for my $real_name (keys %$have_colors) {
 		next if (exists($want_colors->{$real_name}));
-		if (!exists($seen_names->{lc $real_name})) {
+		if (!exists($seen_names->{sort_key($real_name)})) {
 			# TODO: This can somehow come if we try to add someone who's not in the sheet, too?
 			skv_log("Ønsket å fjerne at $real_name skulle på trening, men de var ikke i regnearket lenger.");
-		} elsif (scalar @{$seen_names->{lc $real_name}} > 1) {
+		} elsif (scalar @{$seen_names->{sort_key($real_name)}} > 1) {
 			# Don't touch them.
 		} else {
 			skv_log("Fjerner at $real_name skal på trening.");
@@ -599,7 +611,7 @@ sub run {
 			$slack_userid_to_slack_name{$userid} = $slack_name;
 		}
 
-		if (exists($seen_names{lc $slack_name})) {
+		if (exists($seen_names{sort_key($slack_name)})) {
 			# The name exists exactly, once or more, so it's a direct match and we ignore any fuzz.
 			$slack_userid_to_real_name{$userid} = $slack_name;
 			push @slack_mapping_updates, {
@@ -664,11 +676,11 @@ sub run {
 		my $real_name = $slack_userid_to_real_name{$userid};
 
 		# See if we can find them in the spreadsheet.
-		if (!exists($seen_names{lc $real_name})) {
+		if (!exists($seen_names{sort_key($real_name)})) {
 			# TODO: Perhaps move this logic further down, for consistency?
 			skv_log("$slack_name ($userid) er påmeldt på Slack, og er mappet til $real_name, men var ikke i noen gruppe.");
 		} else {
-			my $seen = $seen_names{lc $real_name};
+			my $seen = $seen_names{sort_key($real_name)};
 			if (scalar @$seen >= 2) {
 				skv_log("$slack_name ($userid) er påmeldt på Slack, men står flere steder (se over); vet ikke hvilken celle som skal brukes.");
 			} else {
@@ -695,7 +707,7 @@ sub run {
 
 	for my $diff (@diffs) {
 		my $real_name = $diff->[0];
-		my $seen = $seen_names{lc $real_name};
+		my $seen = $seen_names{sort_key($real_name)};
 
 		# We've already complained about these earlier, so just skip them silently.
 		next if (scalar @$seen > 1);
-- 
2.39.2