should be it
This commit is contained in:
81
external/duckdb/third_party/brotli/enc/utf8_util.cpp
vendored
Normal file
81
external/duckdb/third_party/brotli/enc/utf8_util.cpp
vendored
Normal file
@@ -0,0 +1,81 @@
|
||||
/* Copyright 2013 Google Inc. All Rights Reserved.

   Distributed under MIT license.
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
*/
|
||||
|
||||
/* Heuristics for deciding about the UTF8-ness of strings. */
|
||||
|
||||
#include "utf8_util.h"
|
||||
|
||||
#include <brotli/types.h>
|
||||
|
||||
using namespace duckdb_brotli;
|
||||
|
||||
/* Decodes one symbol from the start of |input| and stores it in |*symbol|.

   |size| is the number of bytes the caller allows this call to look at; a
   multi-byte sequence is only accepted when it fits entirely within |size|.
   |input[0]| is read unconditionally, so |size| must be at least 1.

   Returns the number of bytes consumed:
     - 1..4 for a well-formed sequence, with |*symbol| set to its code point.
       Overlong encodings and values above 0x10FFFF are not accepted.
     - 1 for anything else (a NUL byte, a stray continuation byte, an overlong
       or truncated sequence, ...), with |*symbol| set to 0x110000 | input[0].
       That value lies above the Unicode code space, so callers can detect the
       failure with a single comparison against 0x110000.

   This is a heuristic helper, not a strict validator: 3-byte sequences that
   encode surrogate code points (0xD800..0xDFFF) are accepted. */
static size_t BrotliParseAsUTF8(
    int* symbol, const uint8_t* input, size_t size) {
  /* ASCII */
  if ((input[0] & 0x80) == 0) {
    *symbol = input[0];
    /* A NUL byte is deliberately not counted as text; it falls through to the
       "not UTF8" case at the bottom. */
    if (*symbol > 0) {
      return 1;
    }
  }
  /* 2-byte UTF8 */
  if (size > 1u &&
      (input[0] & 0xE0) == 0xC0 &&
      (input[1] & 0xC0) == 0x80) {
    *symbol = (((input[0] & 0x1F) << 6) |
               (input[1] & 0x3F));
    /* Reject overlong encodings of values that fit in one byte. */
    if (*symbol > 0x7F) {
      return 2;
    }
  }
  /* 3-byte UTF8 */
  if (size > 2u &&
      (input[0] & 0xF0) == 0xE0 &&
      (input[1] & 0xC0) == 0x80 &&
      (input[2] & 0xC0) == 0x80) {
    *symbol = (((input[0] & 0x0F) << 12) |
               ((input[1] & 0x3F) << 6) |
               (input[2] & 0x3F));
    /* Reject overlong encodings of values that fit in two bytes. */
    if (*symbol > 0x7FF) {
      return 3;
    }
  }
  /* 4-byte UTF8 */
  if (size > 3u &&
      (input[0] & 0xF8) == 0xF0 &&
      (input[1] & 0xC0) == 0x80 &&
      (input[2] & 0xC0) == 0x80 &&
      (input[3] & 0xC0) == 0x80) {
    *symbol = (((input[0] & 0x07) << 18) |
               ((input[1] & 0x3F) << 12) |
               ((input[2] & 0x3F) << 6) |
               (input[3] & 0x3F));
    /* Reject overlong encodings and values beyond the last code point. */
    if (*symbol > 0xFFFF && *symbol <= 0x10FFFF) {
      return 4;
    }
  }
  /* Not UTF8, emit a special symbol above the UTF8-code space */
  *symbol = 0x110000 | input[0];
  return 1;
}
|
||||
|
||||
/* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
|
||||
BROTLI_BOOL duckdb_brotli::BrotliIsMostlyUTF8(
|
||||
const uint8_t* data, const size_t pos, const size_t mask,
|
||||
const size_t length, const double min_fraction) {
|
||||
size_t size_utf8 = 0;
|
||||
size_t i = 0;
|
||||
while (i < length) {
|
||||
int symbol;
|
||||
size_t bytes_read =
|
||||
BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
|
||||
i += bytes_read;
|
||||
if (symbol < 0x110000) size_utf8 += bytes_read;
|
||||
}
|
||||
return TO_BROTLI_BOOL((double)size_utf8 > min_fraction * (double)length);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user