All mojibake issues in the RSS feed have now been fixed
After previously converting the full system to UTF-8, we have now also cleaned up the remaining mojibake issues in the RSS feed.
These issues were caused by older character encoding problems, where characters such as Swedish å, ä and ö, quotation marks, and other special characters were displayed incorrectly.
At the same time, we have created a script that can fix these issues directly in the database while the system is running. This means that any remaining or newly discovered mojibake issues can be handled live, without having to manually rebuild the RSS feed.
The RSS feed should now be much cleaner and correctly encoded.
#!/usr/bin/env bash
set -euo pipefail
usage() {
cat <<'EOF'
Usage:
mojibake.sh --database DB --user USER --table TABLE --columns col1,col2 [options]
Required:
--database DB Database/schema name
--user USER MySQL/MariaDB user
--table TABLE Table name
--columns col1,col2 Columns to fix, comma-separated
Password:
--pass PASS MySQL/MariaDB password
--ask-pass Ask for password
MYSQL_PWD='secret' Alternative to --pass
Optional:
--host HOST Default: localhost
--port PORT Default: 3306
--pk COLUMN Numeric primary/range column. Auto-detected if possible.
--column COLUMN Add one column. Can be repeated.
--json-columns col1,col2 Treat these columns as JSON text and escape double quotes.
--json-column COLUMN Add one JSON text column. Can be repeated.
--batch-size N Default: 5000
--sample-limit N Default: 2
--sample-batches N Default: 3
--backup-dir DIR Default: ./mysql-backups
--apply Actually update rows. Default is dry-run.
--dry-run Only scan and show progress.
--no-backup Do not create mysqldump backup. Not recommended.
--help Show this help.
Examples:
mojibake.sh --database rss --user root --ask-pass --table content --columns json,title,description --dry-run
mojibake.sh --database rss --user root --ask-pass --table content --columns json,title,description --json-columns json --batch-size 5000 --apply
mojibake.sh --database rss --user root --ask-pass --table content --pk id --columns json,title,description --apply
EOF
}
die() {
echo "Error: $*" >&2
echo >&2
usage >&2
exit 1
}
trim() {
local value="$*"
value="${value#"${value%%[![:space:]]*}"}"
value="${value%"${value##*[![:space:]]}"}"
printf '%s' "$value"
}
quote_ident() {
local value="$1"
value="${value//\`/\`\`}"
printf '`%s`' "$value"
}
sql_string() {
local value="$1"
value="${value//\\/\\\\}"
value="${value//\'/\'\'}"
printf "'%s'" "$value"
}
add_csv_columns() {
local csv="$1"
local target_array_name="$2"
local item
local clean
local items
IFS=',' read -r -a items <<< "$csv"
for item in "${items[@]}"; do
clean="$(trim "$item")"
[[ -z "$clean" ]] && continue
eval "$target_array_name+=(\"\$clean\")"
done
}
join_sql() {
local separator="$1"
shift
local output=""
local item
for item in "$@"; do
if [[ -z "$output" ]]; then
output="$item"
else
output+="$separator$item"
fi
done
printf '%s' "$output"
}
contains_column() {
local needle="$1"
shift
local item
for item in "$@"; do
[[ "$item" == "$needle" ]] && return 0
done
return 1
}
is_integer_type() {
local data_type="$1"
case "$data_type" in
tinyint|smallint|mediumint|int|integer|bigint)
return 0
;;
*)
return 1
;;
esac
}
DATABASE=""
MYSQL_USER_NAME=""
MYSQL_PASSWORD_VALUE=""
MYSQL_PASSWORD_PROVIDED=0
ASK_PASS=0
HOST="localhost"
PORT="3306"
TABLE=""
PK_COLUMN=""
BACKUP_DIR="./mysql-backups"
MODE="dry-run"
DO_BACKUP=1
BATCH_SIZE=5000
SAMPLE_LIMIT=2
SAMPLE_BATCHES=3
FIX_COLUMNS=()
JSON_FIX_COLUMNS=()
if [[ $# -eq 0 ]]; then
usage
exit 1
fi
while [[ $# -gt 0 ]]; do
case "$1" in
--database)
[[ $# -ge 2 ]] || die "Missing value for --database"
DATABASE="$2"
shift 2
;;
--database=*)
DATABASE="${1#*=}"
shift
;;
--user)
[[ $# -ge 2 ]] || die "Missing value for --user"
MYSQL_USER_NAME="$2"
shift 2
;;
--user=*)
MYSQL_USER_NAME="${1#*=}"
shift
;;
--pass|--password)
[[ $# -ge 2 ]] || die "Missing value for --pass"
MYSQL_PASSWORD_VALUE="$2"
MYSQL_PASSWORD_PROVIDED=1
shift 2
;;
--pass=*|--password=*)
MYSQL_PASSWORD_VALUE="${1#*=}"
MYSQL_PASSWORD_PROVIDED=1
shift
;;
--ask-pass)
ASK_PASS=1
shift
;;
--host)
[[ $# -ge 2 ]] || die "Missing value for --host"
HOST="$2"
shift 2
;;
--host=*)
HOST="${1#*=}"
shift
;;
--port)
[[ $# -ge 2 ]] || die "Missing value for --port"
PORT="$2"
shift 2
;;
--port=*)
PORT="${1#*=}"
shift
;;
--table)
[[ $# -ge 2 ]] || die "Missing value for --table"
TABLE="$2"
shift 2
;;
--table=*)
TABLE="${1#*=}"
shift
;;
--pk)
[[ $# -ge 2 ]] || die "Missing value for --pk"
PK_COLUMN="$2"
shift 2
;;
--pk=*)
PK_COLUMN="${1#*=}"
shift
;;
--columns)
[[ $# -ge 2 ]] || die "Missing value for --columns"
add_csv_columns "$2" FIX_COLUMNS
shift 2
;;
--columns=*)
add_csv_columns "${1#*=}" FIX_COLUMNS
shift
;;
--column)
[[ $# -ge 2 ]] || die "Missing value for --column"
FIX_COLUMNS+=("$2")
shift 2
;;
--column=*)
FIX_COLUMNS+=("${1#*=}")
shift
;;
--json-columns)
[[ $# -ge 2 ]] || die "Missing value for --json-columns"
add_csv_columns "$2" JSON_FIX_COLUMNS
shift 2
;;
--json-columns=*)
add_csv_columns "${1#*=}" JSON_FIX_COLUMNS
shift
;;
--json-column)
[[ $# -ge 2 ]] || die "Missing value for --json-column"
JSON_FIX_COLUMNS+=("$2")
shift 2
;;
--json-column=*)
JSON_FIX_COLUMNS+=("${1#*=}")
shift
;;
--batch-size)
[[ $# -ge 2 ]] || die "Missing value for --batch-size"
BATCH_SIZE="$2"
shift 2
;;
--batch-size=*)
BATCH_SIZE="${1#*=}"
shift
;;
--sample-limit)
[[ $# -ge 2 ]] || die "Missing value for --sample-limit"
SAMPLE_LIMIT="$2"
shift 2
;;
--sample-limit=*)
SAMPLE_LIMIT="${1#*=}"
shift
;;
--sample-batches)
[[ $# -ge 2 ]] || die "Missing value for --sample-batches"
SAMPLE_BATCHES="$2"
shift 2
;;
--sample-batches=*)
SAMPLE_BATCHES="${1#*=}"
shift
;;
--backup-dir)
[[ $# -ge 2 ]] || die "Missing value for --backup-dir"
BACKUP_DIR="$2"
shift 2
;;
--backup-dir=*)
BACKUP_DIR="${1#*=}"
shift
;;
--apply)
MODE="apply"
shift
;;
--dry-run)
MODE="dry-run"
shift
;;
--no-backup)
DO_BACKUP=0
shift
;;
--help|-h)
usage
exit 0
;;
*)
die "Unknown option: $1"
;;
esac
done
[[ -n "$DATABASE" ]] || die "--database is required"
[[ -n "$MYSQL_USER_NAME" ]] || die "--user is required"
[[ -n "$TABLE" ]] || die "--table is required"
[[ "${#FIX_COLUMNS[@]}" -gt 0 ]] || die "--columns or --column is required"
[[ "$BATCH_SIZE" =~ ^[0-9]+$ ]] || die "--batch-size must be numeric"
[[ "$SAMPLE_LIMIT" =~ ^[0-9]+$ ]] || die "--sample-limit must be numeric"
[[ "$SAMPLE_BATCHES" =~ ^[0-9]+$ ]] || die "--sample-batches must be numeric"
[[ "$BATCH_SIZE" -gt 0 ]] || die "--batch-size must be greater than 0"
if [[ "$ASK_PASS" -eq 1 ]]; then
read -r -s -p "MySQL password: " MYSQL_PASSWORD_VALUE
echo
MYSQL_PASSWORD_PROVIDED=1
fi
if [[ "$MYSQL_PASSWORD_PROVIDED" -eq 1 ]]; then
export MYSQL_PWD="$MYSQL_PASSWORD_VALUE"
elif [[ -z "${MYSQL_PWD:-}" ]]; then
die "--pass, --ask-pass, or MYSQL_PWD is required"
fi
MYSQL=(
mysql
--default-character-set=utf8mb4
-h "$HOST"
-P "$PORT"
-u "$MYSQL_USER_NAME"
--database="$DATABASE"
--batch
--raw
--silent
)
MYSQLDUMP=(
mysqldump
--default-character-set=utf8mb4
-h "$HOST"
-P "$PORT"
-u "$MYSQL_USER_NAME"
--single-transaction
--quick
)
TEXT_MAP='
c383c2a5 c3a5
c383c2a4 c3a4
c383c2b6 c3b6
c383e280a6 c385
c383c285 c385
c383e2809e c384
c383c284 c384
c383e28093 c396
c383c296 c396
c383c2a9 c3a9
c383c2a8 c3a8
c383c2a1 c3a1
c383c2b3 c3b3
c383c2b2 c3b2
c383c2ba c3ba
c383c2b9 c3b9
c383c2bc c3bc
c383c2b1 c3b1
c383c2b8 c3b8
c383c2a6 c3a6
c383c2a7 c3a7
c383e280b0 c389
c383c289 c389
c383cb86 c388
c383c288 c388
c383c281 c381
c383c293 c393
c383e2809c c393
c383c292 c392
c383e28099 c392
c383c39a c39a
c383c5a1 c39a
c383c39c c39c
c383c593 c39c
c383c291 c391
c383e28098 c391
c383c398 c398
c383cb9c c398
c383c386 c386
c383e280a0 c386
c383c387 c387
c383e280a1 c387
c3a2e282acc593 22
c3a2c280c29c 22
c3a2e282acc29d 22
c3a2c280c29d 22
c3a2e282accb9c 27
c3a2c280c298 27
c3a2e282ace284a2 27
c3a2c280c299 27
c3a2e282ace2809c 2d
c3a2c280c293 2d
c3a2e282ace2809d 2d
c3a2c280c294 2d
c3a2e282acc2a6 2e2e2e
c3a2c280c2a6 2e2e2e
c3a2e282acc2a2 2d
c3a2c280c2a2 2d
c3a2e2809ec2a2 e284a2
c3a2c284c2a2 e284a2
c3a2e2809ac2ac e282ac
c3a2c282c2ac e282ac
c382c2a0 20
c382c2a7 c2a7
c382c2a9 c2a9
c382c2ae c2ae
c382c2b0 c2b0
c382c2b1 c2b1
c382c2ab c2ab
c382c2bb c2bb
'
JSON_MAP='
c383c2a5 c3a5
c383c2a4 c3a4
c383c2b6 c3b6
c383e280a6 c385
c383c285 c385
c383e2809e c384
c383c284 c384
c383e28093 c396
c383c296 c396
c383c2a9 c3a9
c383c2a8 c3a8
c383c2a1 c3a1
c383c2b3 c3b3
c383c2b2 c3b2
c383c2ba c3ba
c383c2b9 c3b9
c383c2bc c3bc
c383c2b1 c3b1
c383c2b8 c3b8
c383c2a6 c3a6
c383c2a7 c3a7
c383e280b0 c389
c383c289 c389
c383cb86 c388
c383c288 c388
c383c281 c381
c383c293 c393
c383e2809c c393
c383c292 c392
c383e28099 c392
c383c39a c39a
c383c5a1 c39a
c383c39c c39c
c383c593 c39c
c383c291 c391
c383e28098 c391
c383c398 c398
c383cb9c c398
c383c386 c386
c383e280a0 c386
c383c387 c387
c383e280a1 c387
c3a2e282acc593 5c22
c3a2c280c29c 5c22
c3a2e282acc29d 5c22
c3a2c280c29d 5c22
c3a2e282accb9c 27
c3a2c280c298 27
c3a2e282ace284a2 27
c3a2c280c299 27
c3a2e282ace2809c 2d
c3a2c280c293 2d
c3a2e282ace2809d 2d
c3a2c280c294 2d
c3a2e282acc2a6 2e2e2e
c3a2c280c2a6 2e2e2e
c3a2e282acc2a2 2d
c3a2c280c2a2 2d
c3a2e2809ec2a2 e284a2
c3a2c284c2a2 e284a2
c3a2e2809ac2ac e282ac
c3a2c282c2ac e282ac
c382c2a0 20
c382c2a7 c2a7
c382c2a9 c2a9
c382c2ae c2ae
c382c2b0 c2b0
c382c2b1 c2b1
c382c2ab c2ab
c382c2bb c2bb
'
build_replace_expr() {
local column="$1"
local map="$2"
local quoted_column
local expr
local bad_hex
local good_hex
quoted_column="$(quote_ident "$column")"
expr="CAST(${quoted_column} AS CHAR CHARACTER SET utf8mb4)"
while read -r bad_hex good_hex; do
[[ -z "${bad_hex:-}" ]] && continue
expr="REPLACE(${expr}, CONVERT(0x${bad_hex} USING utf8mb4), CONVERT(0x${good_hex} USING utf8mb4))"
done <<< "$map"
printf '%s' "$expr"
}
build_column_where() {
local column="$1"
local map="$2"
local quoted_column
local expr
local bad_hex
local good_hex
quoted_column="$(quote_ident "$column")"
expr="CAST(${quoted_column} AS CHAR CHARACTER SET utf8mb4)"
while read -r bad_hex good_hex; do
[[ -z "${bad_hex:-}" ]] && continue
printf 'LOCATE(CONVERT(0x%s USING utf8mb4), %s) > 0\n' "$bad_hex" "$expr"
done <<< "$map"
}
column_data_type() {
local column="$1"
"${MYSQL[@]}" -N -e "
SELECT DATA_TYPE
FROM information_schema.COLUMNS
WHERE TABLE_SCHEMA = $(sql_string "$DATABASE")
AND TABLE_NAME = $(sql_string "$TABLE")
AND COLUMN_NAME = $(sql_string "$column")
LIMIT 1;
"
}
detect_single_numeric_pk() {
local pk_count
local pk_name
local pk_type
pk_count="$("${MYSQL[@]}" -N -e "
SELECT COUNT(*)
FROM information_schema.KEY_COLUMN_USAGE
WHERE TABLE_SCHEMA = $(sql_string "$DATABASE")
AND TABLE_NAME = $(sql_string "$TABLE")
AND CONSTRAINT_NAME = 'PRIMARY';
")"
[[ "$pk_count" == "1" ]] || return 1
read -r pk_name pk_type < <("${MYSQL[@]}" -N -e "
SELECT k.COLUMN_NAME, c.DATA_TYPE
FROM information_schema.KEY_COLUMN_USAGE k
JOIN information_schema.COLUMNS c
ON c.TABLE_SCHEMA = k.TABLE_SCHEMA
AND c.TABLE_NAME = k.TABLE_NAME
AND c.COLUMN_NAME = k.COLUMN_NAME
WHERE k.TABLE_SCHEMA = $(sql_string "$DATABASE")
AND k.TABLE_NAME = $(sql_string "$TABLE")
AND k.CONSTRAINT_NAME = 'PRIMARY'
LIMIT 1;
")
if is_integer_type "$pk_type"; then
printf '%s' "$pk_name"
return 0
fi
return 1
}
echo "Checking connection and table metadata..."
"${MYSQL[@]}" -e "SET NAMES utf8mb4; SELECT 1;" >/dev/null
if [[ -z "$PK_COLUMN" ]]; then
if PK_COLUMN="$(detect_single_numeric_pk)"; then
echo "Range column: $PK_COLUMN (auto-detected primary key)"
else
die "Could not auto-detect a single numeric primary key. Use --pk id or another indexed numeric column."
fi
else
pk_type="$(column_data_type "$PK_COLUMN")"
[[ -n "$pk_type" ]] || die "PK/range column does not exist: ${DATABASE}.${TABLE}.${PK_COLUMN}"
is_integer_type "$pk_type" || die "--pk column must be numeric/integer, got: $PK_COLUMN ($pk_type)"
echo "Range column: $PK_COLUMN (manual)"
fi
TARGET="$(quote_ident "$DATABASE").$(quote_ident "$TABLE")"
QUOTED_PK="$(quote_ident "$PK_COLUMN")"
SET_PARTS=()
WHERE_PARTS=()
SAMPLE_COLUMNS=()
for column in "${FIX_COLUMNS[@]}"; do
column="$(trim "$column")"
[[ -z "$column" ]] && continue
data_type="$(column_data_type "$column")"
[[ -n "$data_type" ]] || die "Column does not exist: ${DATABASE}.${TABLE}.${column}"
map="$TEXT_MAP"
mode_label="text"
if [[ "$data_type" == "json" ]] || contains_column "$column" "${JSON_FIX_COLUMNS[@]}"; then
map="$JSON_MAP"
mode_label="json"
fi
echo "Column: $column ($data_type, $mode_label mode)"
SET_PARTS+=("$(quote_ident "$column") = $(build_replace_expr "$column" "$map")")
SAMPLE_COLUMNS+=("LEFT(CAST($(quote_ident "$column") AS CHAR CHARACTER SET utf8mb4), 220) AS $(quote_ident "$column")")
while IFS= read -r condition; do
[[ -z "$condition" ]] && continue
WHERE_PARTS+=("$condition")
done < <(build_column_where "$column" "$map")
done
[[ "${#SET_PARTS[@]}" -gt 0 ]] || die "No usable columns found"
[[ "${#WHERE_PARTS[@]}" -gt 0 ]] || die "No replacement map loaded"
SET_SQL="$(join_sql $',\n ' "${SET_PARTS[@]}")"
WHERE_SQL="$(join_sql $' OR\n ' "${WHERE_PARTS[@]}")"
SAMPLE_SQL="$(join_sql $',\n ' "${SAMPLE_COLUMNS[@]}")"
read -r PK_MIN PK_MAX < <("${MYSQL[@]}" -N -e "
SET NAMES utf8mb4;
SELECT COALESCE(MIN(${QUOTED_PK}), 0), COALESCE(MAX(${QUOTED_PK}), 0)
FROM ${TARGET};
")
if [[ "$PK_MIN" == "0" && "$PK_MAX" == "0" ]]; then
echo "Table appears empty. Nothing to do."
exit 0
fi
TOTAL_SPAN=$((PK_MAX - PK_MIN + 1))
if [[ "$TOTAL_SPAN" -le 0 ]]; then
die "Invalid range: min=$PK_MIN max=$PK_MAX"
fi
echo
echo "Mode: $MODE"
echo "Target: ${DATABASE}.${TABLE}"
echo "Range: ${PK_COLUMN} ${PK_MIN}..${PK_MAX}"
echo "Batch size: $BATCH_SIZE"
echo
if [[ "$MODE" == "apply" && "$DO_BACKUP" -eq 1 ]]; then
mkdir -p "$BACKUP_DIR"
BACKUP_FILE="${BACKUP_DIR}/${DATABASE}.${TABLE}.before-mojibake-fix.$(date +%Y%m%d-%H%M%S).sql"
echo "Creating backup: $BACKUP_FILE"
if command -v pv >/dev/null 2>&1; then
"${MYSQLDUMP[@]}" "$DATABASE" "$TABLE" | pv -b -r -t > "$BACKUP_FILE"
else
echo "Tip: install pv if you want byte progress during backup."
"${MYSQLDUMP[@]}" "$DATABASE" "$TABLE" > "$BACKUP_FILE"
fi
echo "Backup done."
echo
elif [[ "$MODE" == "apply" ]]; then
echo "Backup disabled with --no-backup. Continuing without dump."
echo
fi
CURRENT_START="$PK_MIN"
BATCH_NO=0
TOTAL_MATCHES=0
TOTAL_CHANGED=0
SAMPLES_SHOWN=0
while [[ "$CURRENT_START" -le "$PK_MAX" ]]; do
CURRENT_END=$((CURRENT_START + BATCH_SIZE - 1))
if [[ "$CURRENT_END" -gt "$PK_MAX" ]]; then
CURRENT_END="$PK_MAX"
fi
BATCH_NO=$((BATCH_NO + 1))
PROCESSED_SPAN=$((CURRENT_END - PK_MIN + 1))
PERCENT=$((PROCESSED_SPAN * 100 / TOTAL_SPAN))
BATCH_WHERE="
${QUOTED_PK} BETWEEN ${CURRENT_START} AND ${CURRENT_END}
AND (
${WHERE_SQL}
)
"
MATCHES="$("${MYSQL[@]}" -N -e "
SET NAMES utf8mb4;
SELECT COUNT(*)
FROM ${TARGET}
WHERE ${BATCH_WHERE};
")"
TOTAL_MATCHES=$((TOTAL_MATCHES + MATCHES))
if [[ "$MODE" == "apply" ]]; then
CHANGED="$("${MYSQL[@]}" -N -e "
SET NAMES utf8mb4;
START TRANSACTION;
UPDATE ${TARGET}
SET
${SET_SQL}
WHERE ${BATCH_WHERE};
SELECT ROW_COUNT();
COMMIT;
")"
TOTAL_CHANGED=$((TOTAL_CHANGED + CHANGED))
printf '[%s] Batch %d | %s=%s..%s | %3d%% | matches=%s | changed=%s | total_changed=%s\n' \
"$(date '+%H:%M:%S')" \
"$BATCH_NO" \
"$PK_COLUMN" \
"$CURRENT_START" \
"$CURRENT_END" \
"$PERCENT" \
"$MATCHES" \
"$CHANGED" \
"$TOTAL_CHANGED"
else
printf '[%s] Batch %d | %s=%s..%s | %3d%% | matches=%s | total_matches=%s\n' \
"$(date '+%H:%M:%S')" \
"$BATCH_NO" \
"$PK_COLUMN" \
"$CURRENT_START" \
"$CURRENT_END" \
"$PERCENT" \
"$MATCHES" \
"$TOTAL_MATCHES"
fi
if [[ "$MATCHES" -gt 0 && "$SAMPLES_SHOWN" -lt "$SAMPLE_BATCHES" && "$SAMPLE_LIMIT" -gt 0 ]]; then
SAMPLES_SHOWN=$((SAMPLES_SHOWN + 1))
echo "Sample batch $SAMPLES_SHOWN:"
"${MYSQL[@]}" -e "
SET NAMES utf8mb4;
SELECT
${QUOTED_PK} AS pk,
${SAMPLE_SQL}
FROM ${TARGET}
WHERE ${BATCH_WHERE}
LIMIT ${SAMPLE_LIMIT};
"
echo
fi
CURRENT_START=$((CURRENT_END + 1))
done
echo
echo "Finished."
echo "Total matching rows seen: $TOTAL_MATCHES"
if [[ "$MODE" == "apply" ]]; then
echo "Total changed rows: $TOTAL_CHANGED"
else
echo "Dry-run only. Re-run with --apply to update data."
fi
