diff options
Diffstat (limited to 'contrib/recode.pl')
-rwxr-xr-x | contrib/recode.pl | 315 |
1 files changed, 161 insertions, 154 deletions
diff --git a/contrib/recode.pl b/contrib/recode.pl index e6da47b92..9dc1d1bf5 100755 --- a/contrib/recode.pl +++ b/contrib/recode.pl @@ -32,21 +32,26 @@ use constant MAX_STRING_LEN => 25; # For certain tables, we can't automatically determine their Primary Key. # So, we specify it here as a string. use constant SPECIAL_KEYS => { - # bugs_activity since 4.4 has a unique primary key added - bugs_activity => 'bug_id,bug_when,fieldid', - profile_setting => 'user_id,setting_name', - # profiles_activity since 4.4 has a unique primary key added - profiles_activity => 'userid,profiles_when,fieldid', - setting_value => 'name,value', - # longdescs didn't used to have a PK, before 2.20. - longdescs => 'bug_id,bug_when', - # The 2.16 versions table lacked a PK - versions => 'product_id,value', - # These are all for earlier versions of Bugzilla. On a modern - # version of Bugzilla, this script will ignore these (thanks to - # code further down). - components => 'program,value', - products => 'product', + + # bugs_activity since 4.4 has a unique primary key added + bugs_activity => 'bug_id,bug_when,fieldid', + profile_setting => 'user_id,setting_name', + + # profiles_activity since 4.4 has a unique primary key added + profiles_activity => 'userid,profiles_when,fieldid', + setting_value => 'name,value', + + # longdescs didn't used to have a PK, before 2.20. + longdescs => 'bug_id,bug_when', + + # The 2.16 versions table lacked a PK + versions => 'product_id,value', + + # These are all for earlier versions of Bugzilla. On a modern + # version of Bugzilla, this script will ignore these (thanks to + # code further down). + components => 'program,value', + products => 'product', }; ############### @@ -55,18 +60,18 @@ use constant SPECIAL_KEYS => { # "truncate" is a file operation in perl, so we can't use that name. sub trunc { - my ($str) = @_; - my $truncated = substr($str, 0, MAX_STRING_LEN); - if (length($truncated) ne length($str)) { - $truncated .= '...'; - } - return $truncated; + my ($str) = @_; + my $truncated = substr($str, 0, MAX_STRING_LEN); + if (length($truncated) ne length($str)) { + $truncated .= '...'; + } + return $truncated; } sub is_valid_utf8 { - my ($str) = @_; - Encode::_utf8_on($str); - return is_utf8($str, 1); + my ($str) = @_; + Encode::_utf8_on($str); + return is_utf8($str, 1); } ############### @@ -75,170 +80,172 @@ sub is_valid_utf8 { my %switch; GetOptions(\%switch, 'dry-run', 'guess', 'charset=s', 'show-failures', - 'overrides=s', 'help|h'); + 'overrides=s', 'help|h'); -pod2usage({ -verbose => 1 }) if $switch{'help'}; +pod2usage({-verbose => 1}) if $switch{'help'}; # You have to specify at least one of these switches. -pod2usage({ -verbose => 0 }) if (!$switch{'charset'} && !$switch{'guess'}); +pod2usage({-verbose => 0}) if (!$switch{'charset'} && !$switch{'guess'}); if (exists $switch{'charset'}) { - $switch{'charset'} = resolve_alias($switch{'charset'}) - || die "'$switch{charset}' is not a valid charset."; + $switch{'charset'} = resolve_alias($switch{'charset'}) + || die "'$switch{charset}' is not a valid charset."; } if ($switch{'guess'}) { - if (!eval { require Encode::Detect::Detector }) { - my $root = ROOT_USER; - print STDERR <<EOT; + if (!eval { require Encode::Detect::Detector }) { + my $root = ROOT_USER; + print STDERR <<EOT; Using --guess requires that Encode::Detect be installed. To install Encode::Detect, run the following command: $^X install-module.pl Encode::Detect EOT - exit; - } + exit; + } } my %overrides; if (exists $switch{'overrides'}) { - my $file = new IO::File($switch{'overrides'}, 'r') - || die "$switch{overrides}: $!"; - my @lines = $file->getlines(); - $file->close(); - foreach my $line (@lines) { - chomp($line); - my ($digest, $encoding) = split(' ', $line); - $overrides{$digest} = $encoding; - } + my $file = new IO::File($switch{'overrides'}, 'r') + || die "$switch{overrides}: $!"; + my @lines = $file->getlines(); + $file->close(); + foreach my $line (@lines) { + chomp($line); + my ($digest, $encoding) = split(' ', $line); + $overrides{$digest} = $encoding; + } } my $dbh = Bugzilla->dbh; if ($dbh->isa('Bugzilla::DB::Mysql')) { - # Get the actual current encoding of the DB. - my $collation_data = $dbh->selectrow_arrayref( - "SHOW VARIABLES LIKE 'character_set_database'"); - my $db_charset = $collation_data->[1]; - # Set our connection encoding to *that* encoding, so that MySQL - # correctly accepts our changes. - $dbh->do("SET NAMES $db_charset"); - # Make the database give us raw bytes. - $dbh->do('SET character_set_results = NULL') + + # Get the actual current encoding of the DB. + my $collation_data + = $dbh->selectrow_arrayref("SHOW VARIABLES LIKE 'character_set_database'"); + my $db_charset = $collation_data->[1]; + + # Set our connection encoding to *that* encoding, so that MySQL + # correctly accepts our changes. + $dbh->do("SET NAMES $db_charset"); + + # Make the database give us raw bytes. + $dbh->do('SET character_set_results = NULL'); } $dbh->begin_work; foreach my $table ($dbh->bz_table_list_real) { - my @columns = $dbh->bz_table_columns($table); - - my $pk = SPECIAL_KEYS->{$table}; - if ($pk) { - # Assure that we're on a version of Bugzilla where those keys - # actually exist. - foreach my $column (split ',', $pk) { - $pk = undef if !$dbh->bz_column_info($table, $column); - } + my @columns = $dbh->bz_table_columns($table); + + my $pk = SPECIAL_KEYS->{$table}; + if ($pk) { + + # Assure that we're on a version of Bugzilla where those keys + # actually exist. + foreach my $column (split ',', $pk) { + $pk = undef if !$dbh->bz_column_info($table, $column); } + } - # Figure out the primary key. + # Figure out the primary key. + foreach my $column (@columns) { + my $def = $dbh->bz_column_info($table, $column); + $pk = $column if $def->{PRIMARYKEY}; + } + + # If there's no PK, it's defined by a UNIQUE index. + if (!$pk) { foreach my $column (@columns) { - my $def = $dbh->bz_column_info($table, $column); - $pk = $column if $def->{PRIMARYKEY}; - } - # If there's no PK, it's defined by a UNIQUE index. - if (!$pk) { - foreach my $column (@columns) { - my $index = $dbh->bz_index_info($table, "${table}_${column}_idx"); - if ($index && ref($index) eq 'HASH') { - $pk = join(',', @{$index->{FIELDS}}) - if $index->{TYPE} eq 'UNIQUE'; - } - } + my $index = $dbh->bz_index_info($table, "${table}_${column}_idx"); + if ($index && ref($index) eq 'HASH') { + $pk = join(',', @{$index->{FIELDS}}) if $index->{TYPE} eq 'UNIQUE'; + } } + } - foreach my $column (@columns) { - my $def = $dbh->bz_column_info($table, $column); - # If this is a text column, it may need work. - if ($def->{TYPE} =~ /text|char/i) { - # If there's still no PK, we're upgrading from 2.14 or earlier. - # We can't reliably determine the PK (or at least, I don't want to - # maintain code to record what the PK was at all points in history). - # So instead we just use the field itself. - $pk = $column if !$pk; - - print "Converting $table.$column...\n"; - my $sth = $dbh->prepare("SELECT $column, $pk FROM $table + foreach my $column (@columns) { + my $def = $dbh->bz_column_info($table, $column); + + # If this is a text column, it may need work. + if ($def->{TYPE} =~ /text|char/i) { + + # If there's still no PK, we're upgrading from 2.14 or earlier. + # We can't reliably determine the PK (or at least, I don't want to + # maintain code to record what the PK was at all points in history). + # So instead we just use the field itself. + $pk = $column if !$pk; + + print "Converting $table.$column...\n"; + my $sth = $dbh->prepare( + "SELECT $column, $pk FROM $table WHERE $column IS NOT NULL - AND $column != ''"); - - my @pk_array = map {"$_ = ?"} split(',', $pk); - my $pk_where = join(' AND ', @pk_array); - my $update_sth = $dbh->prepare( - "UPDATE $table SET $column = ? WHERE $pk_where"); - - $sth->execute(); - - while (my @result = $sth->fetchrow_array) { - my $data = shift @result; - # Wide characters cause md5_base64() to die. - my $digest_data = utf8::is_utf8($data) - ? Encode::encode_utf8($data) : $data; - my $digest = md5_base64($digest_data); - - my @primary_keys = reverse split(',', $pk); - # We copy the array so that we can pop things from it without - # affecting the original. - my @pk_data = @result; - my $pk_line = join (', ', - map { "$_ = " . pop @pk_data } @primary_keys); - - my $encoding; - if ($switch{'guess'}) { - $encoding = detect_encoding($data); - - # We only show failures if they don't appear to be - # ASCII. - if ($switch{'show-failures'} && !$encoding - && !is_valid_utf8($data)) - { - my $truncated = trunc($data); - print "Row: [$pk_line]\n", - "Failed to guess: Key: $digest", - " DATA: $truncated\n"; - } - - # If we fail a guess, and the data is valid UTF-8, - # just assume we failed because it's UTF-8. - next if is_valid_utf8($data); - } - - # If we couldn't detect the charset (or were instructed - # not to try), we fall back to --charset. If there's no - # fallback, we just do nothing. - if (!$encoding && $switch{'charset'}) { - $encoding = $switch{'charset'}; - } - - $encoding = $overrides{$digest} if $overrides{$digest}; - - # We only fix it if it's not ASCII or UTF-8 already. - if ($encoding && !grep($_ eq $encoding, IGNORE_ENCODINGS)) { - my $decoded = encode('utf8', decode($encoding, $data)); - if ($switch{'dry-run'} && $data ne $decoded) { - print "Row: [$pk_line]\n", - "From: [" . trunc($data) . "] Key: $digest\n", - "To: [" . trunc($decoded) . "]", - " Encoding : $encoding\n"; - } - else { - $update_sth->execute($decoded, @result); - } - } - } # while (my @result = $sth->fetchrow_array) - } # if ($column->{TYPE} =~ /text|char/i) - } # foreach my $column (@columns) + AND $column != ''" + ); + + my @pk_array = map {"$_ = ?"} split(',', $pk); + my $pk_where = join(' AND ', @pk_array); + my $update_sth = $dbh->prepare("UPDATE $table SET $column = ? WHERE $pk_where"); + + $sth->execute(); + + while (my @result = $sth->fetchrow_array) { + my $data = shift @result; + + # Wide characters cause md5_base64() to die. + my $digest_data = utf8::is_utf8($data) ? Encode::encode_utf8($data) : $data; + my $digest = md5_base64($digest_data); + + my @primary_keys = reverse split(',', $pk); + + # We copy the array so that we can pop things from it without + # affecting the original. + my @pk_data = @result; + my $pk_line = join(', ', map { "$_ = " . pop @pk_data } @primary_keys); + + my $encoding; + if ($switch{'guess'}) { + $encoding = detect_encoding($data); + + # We only show failures if they don't appear to be + # ASCII. + if ($switch{'show-failures'} && !$encoding && !is_valid_utf8($data)) { + my $truncated = trunc($data); + print "Row: [$pk_line]\n", "Failed to guess: Key: $digest", + " DATA: $truncated\n"; + } + + # If we fail a guess, and the data is valid UTF-8, + # just assume we failed because it's UTF-8. + next if is_valid_utf8($data); + } + + # If we couldn't detect the charset (or were instructed + # not to try), we fall back to --charset. If there's no + # fallback, we just do nothing. + if (!$encoding && $switch{'charset'}) { + $encoding = $switch{'charset'}; + } + + $encoding = $overrides{$digest} if $overrides{$digest}; + + # We only fix it if it's not ASCII or UTF-8 already. + if ($encoding && !grep($_ eq $encoding, IGNORE_ENCODINGS)) { + my $decoded = encode('utf8', decode($encoding, $data)); + if ($switch{'dry-run'} && $data ne $decoded) { + print "Row: [$pk_line]\n", "From: [" . trunc($data) . "] Key: $digest\n", + "To: [" . trunc($decoded) . "]", " Encoding : $encoding\n"; + } + else { + $update_sth->execute($decoded, @result); + } + } + } # while (my @result = $sth->fetchrow_array) + } # if ($column->{TYPE} =~ /text|char/i) + } # foreach my $column (@columns) } $dbh->commit; |