F203/Libraries/utf8_to_cp1251.c

230 lines
8.4 KiB
C
Raw Permalink Normal View History

2023-12-13 16:59:07 +03:00
/* Библиотека на языке C для конвертации строк из кодировки UTF-8 в CP1251
* Адрес проекта: https://github.com/bravikov/utf8_to_cp1251
* */
#include "utf8_to_cp1251.h"
#include "bit.h"
#include <stdbool.h>
#ifndef UTF8_TO_CP1251_LINEAR_SEARCH
#include <stdlib.h>
#endif
/* Таблица преобразования кодов. Отсортирована по возрастанию кодов Unicode
* для двоичного поиска. */
static const Cp1251 cp1251Table[] = {
/* {0x98, 0x0098}, */
{0xA0, 0x00A0}, /* NO-BREAK SPACE */
{0xA4, 0x00A4}, /* CURRENCY SIGN */
{0xA6, 0x00A6}, /* BROKEN BAR */
{0xA7, 0x00A7}, /* SECTION SIGN */
{0xA9, 0x00A9}, /* COPYRIGHT SIGN */
{0xAB, 0x00AB}, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
{0xAC, 0x00AC}, /* NOT SIGN */
{0xAD, 0x00AD}, /* SOFT HYPHEN */
{0xAE, 0x00AE}, /* REGISTERED SIGN */
{0xB0, 0x00B0}, /* DEGREE SIGN */
{0xB1, 0x00B1}, /* PLUS-MINUS SIGN */
{0xB5, 0x00B5}, /* MICRO SIGN */
{0xB6, 0x00B6}, /* PILCROW SIGN */
{0xB7, 0x00B7}, /* MIDDLE DOT */
{0xBB, 0x00BB}, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
{0xA8, 0x0401}, /* CYRILLIC CAPITAL LETTER IO */
{0x80, 0x0402}, /* CYRILLIC CAPITAL LETTER DJE */
{0x81, 0x0403}, /* CYRILLIC CAPITAL LETTER GJE */
{0xAA, 0x0404}, /* CYRILLIC CAPITAL LETTER UKRAINIAN IE */
{0xBD, 0x0405}, /* CYRILLIC CAPITAL LETTER DZE */
{0xB2, 0x0406}, /* CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I */
{0xAF, 0x0407}, /* CYRILLIC CAPITAL LETTER YI */
{0xA3, 0x0408}, /* CYRILLIC CAPITAL LETTER JE */
{0x8A, 0x0409}, /* CYRILLIC CAPITAL LETTER LJE */
{0x8C, 0x040A}, /* CYRILLIC CAPITAL LETTER NJE */
{0x8E, 0x040B}, /* CYRILLIC CAPITAL LETTER TSHE */
{0x8D, 0x040C}, /* CYRILLIC CAPITAL LETTER KJE */
{0xA1, 0x040E}, /* CYRILLIC CAPITAL LETTER SHORT U */
{0x8F, 0x040F}, /* CYRILLIC CAPITAL LETTER DZHE */
{0xB8, 0x0451}, /* CYRILLIC SMALL LETTER IO */
{0x90, 0x0452}, /* CYRILLIC SMALL LETTER DJE */
{0x83, 0x0453}, /* CYRILLIC SMALL LETTER GJE */
{0xBA, 0x0454}, /* CYRILLIC SMALL LETTER UKRAINIAN IE */
{0xBE, 0x0455}, /* CYRILLIC SMALL LETTER DZE */
{0xB3, 0x0456}, /* CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I */
{0xBF, 0x0457}, /* CYRILLIC SMALL LETTER YI */
{0xBC, 0x0458}, /* CYRILLIC SMALL LETTER JE */
{0x9A, 0x0459}, /* CYRILLIC SMALL LETTER LJE */
{0x9C, 0x045A}, /* CYRILLIC SMALL LETTER NJE */
{0x9E, 0x045B}, /* CYRILLIC SMALL LETTER TSHE */
{0x9D, 0x045C}, /* CYRILLIC SMALL LETTER KJE */
{0xA2, 0x045E}, /* CYRILLIC SMALL LETTER SHORT U */
{0x9F, 0x045F}, /* CYRILLIC SMALL LETTER DZHE */
{0xA5, 0x0490}, /* CYRILLIC CAPITAL LETTER GHE WITH UPTURN */
{0xB4, 0x0491}, /* CYRILLIC SMALL LETTER GHE WITH UPTURN */
{0x96, 0x2013}, /* EN DASH */
{0x97, 0x2014}, /* EM DASH */
{0x91, 0x2018}, /* LEFT SINGLE QUOTATION MARK */
{0x92, 0x2019}, /* RIGHT SINGLE QUOTATION MARK */
{0x82, 0x201A}, /* SINGLE LOW-9 QUOTATION MARK */
{0x93, 0x201C}, /* LEFT DOUBLE QUOTATION MARK */
{0x94, 0x201D}, /* RIGHT DOUBLE QUOTATION MARK */
{0x84, 0x201E}, /* DOUBLE LOW-9 QUOTATION MARK */
{0x86, 0x2020}, /* DAGGER */
{0x87, 0x2021}, /* DOUBLE DAGGER */
{0x95, 0x2022}, /* BULLET */
{0x85, 0x2026}, /* HORIZONTAL ELLIPSIS */
{0x89, 0x2030}, /* PER MILLE SIGN */
{0x8B, 0x2039}, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
{0x9B, 0x203A}, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
{0x88, 0x20AC}, /* EURO SIGN */
{0xB9, 0x2116}, /* NUMERO SIGN */
{0x99, 0x2122} /* TRADE MARK SIGN */
};
static const Cp1251 * customCp1251Table = 0;
static size_t customCp1251TableSize = 0;
void setCustomCp1251Table(const Cp1251 * table, const size_t count) {
customCp1251Table = table;
customCp1251TableSize = count;
}
#ifndef UTF8_TO_CP1251_LINEAR_SEARCH
/* Функция сравнения для двоичного поиска. */
static int compareConversion (const void * s1, const void * s2) {
return ((Cp1251 *)s1)->unicode - ((Cp1251 *)s2)->unicode;
}
#endif
#define MAX_NUMBER_OF_UTF8_OCTETS (6)
int convertUtf8ToCp1251(const char * utf8, char * cp1251)
{
const uint8_t firstOctetMask[MAX_NUMBER_OF_UTF8_OCTETS] = {
b1000_0000,
b1110_0000,
b1111_0000,
b1111_1000,
b1111_1100,
b1111_1110
};
const uint8_t firstOctetTemplate[MAX_NUMBER_OF_UTF8_OCTETS] = {
b0000_0000,
b1100_0000,
b1110_0000,
b1111_0000,
b1111_1000,
b1111_1100
};
const uint8_t secondOctetMask = b1100_0000;
const uint8_t secondOctetTemplate = b1000_0000;
size_t numberOfRemainingOctets = 0;
uint32_t unicode = 0;
size_t i = 0;
size_t cp1251_i = 0;
while (utf8[i] != '\0') {
const uint8_t octet = utf8[i++];
if (numberOfRemainingOctets == 0) {
bool error = true;
size_t j;
for (j = 0; j < MAX_NUMBER_OF_UTF8_OCTETS; j++) {
const uint8_t octetMask = firstOctetMask[j];
const uint8_t octetTemplate = firstOctetTemplate[j];
if ((octet & octetMask) == octetTemplate) {
unicode = octet & ~octetMask;
if (j == 0) {
/* Обнаружен символ US-ASCII */
cp1251[cp1251_i++] = unicode;
}
numberOfRemainingOctets = j;
error = false;
break;
}
}
if (error) {
/* Ошибка UTF-8. */
return -1;
}
}
else {
if ((octet & secondOctetMask) == secondOctetTemplate) {
unicode <<= 6;
unicode |= octet & ~secondOctetMask;
numberOfRemainingOctets--;
if (numberOfRemainingOctets == 0) {
if (0x410 <= unicode && unicode <= 0x44F) {
cp1251[cp1251_i++] = 0xC0 + unicode - 0x410;
}
else {
const Cp1251 * tables[] = {
customCp1251Table,
cp1251Table
};
const size_t tableSizes[] = {
customCp1251TableSize,
sizeof(cp1251Table) / sizeof(Cp1251),
};
size_t t;
# ifdef UTF8_TO_CP1251_LINEAR_SEARCH
bool found = false;
# endif
for (t = 0; t < 2; t++) {
# ifdef UTF8_TO_CP1251_LINEAR_SEARCH
/* Линейный (последовательный) поиск */
size_t u;
for (u = 0; u < tableSizes[t]; u++) {
if (unicode == tables[t][u].unicode) {
cp1251[cp1251_i++] =
tables[t][u].cp1251;
found = true;
break;
}
}
if (found) {
break;
}
# else
/* Двоичный поиск */
const Cp1251 key = {0, unicode};
const Cp1251 * conversion = bsearch(
&key,
tables[t],
tableSizes[t],
sizeof(Cp1251),
compareConversion
);
if (conversion) {
cp1251[cp1251_i++] = conversion->cp1251;
break;
}
# endif
}
}
}
}
else {
/* Ошибка UTF-8. */
return -1;
}
}
}
cp1251[cp1251_i] = '\0';
return cp1251_i;
}