summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKohei Yoshida <kohei.yoshida@gmail.com>2022-09-14 21:41:44 -0400
committerKohei Yoshida <kohei.yoshida@gmail.com>2022-09-14 21:41:44 -0400
commit792d3c601aefb6b1a32d3918f673380a893ccfaf (patch)
tree60913e6ebbd4a770f2c273590b14595666bf442e
parent902b27227df7154655bb1d6733c6b1b97ef8b804 (diff)
downloadorcus-bugfix/36-utf8-in-flat-output.tar.gz
Add calc_logical_string_length() public function. bugfix/36-utf8-in-flat-output
-rw-r--r--include/orcus/stream.hpp8
-rw-r--r--src/parser/Makefile.am4
-rw-r--r--src/parser/stream.cpp34
-rw-r--r--src/parser/stream_test.cpp32
4 files changed, 77 insertions, 1 deletions
diff --git a/include/orcus/stream.hpp b/include/orcus/stream.hpp
index 84016317..1e24942f 100644
--- a/include/orcus/stream.hpp
+++ b/include/orcus/stream.hpp
@@ -163,6 +163,14 @@ ORCUS_PSR_DLLPUBLIC line_with_offset locate_line_with_offset(std::string_view st
*/
ORCUS_PSR_DLLPUBLIC size_t locate_first_different_char(std::string_view left, std::string_view right);
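+/**
+ * Calculate the logical length of a UTF-8 encoded string, that is, the
+ * number of characters it contains rather than the number of bytes it
+ * occupies.
+ *
+ * @param s UTF-8 encoded string to calculate the logical length of.
+ * @return logical length of the UTF-8 encoded string.
+ * @throws std::invalid_argument if the string contains an invalid character
+ *         or if its last character ends prematurely.
+ */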
+ORCUS_PSR_DLLPUBLIC std::size_t calc_logical_string_length(std::string_view s);
+
} // namespace orcus
#endif
diff --git a/src/parser/Makefile.am b/src/parser/Makefile.am
index 21cde5fb..708f8423 100644
--- a/src/parser/Makefile.am
+++ b/src/parser/Makefile.am
@@ -190,7 +190,9 @@ parser_test_threaded_json_parser_CPPFLAGS = $(AM_CPPFLAGS)
parser_test_stream_SOURCES = \
stream_test.cpp
-parser_test_stream_LDADD = liborcus-parser-@ORCUS_API_VERSION@.la
+parser_test_stream_LDADD = \
+ liborcus-parser-@ORCUS_API_VERSION@.la \
+ ../test/liborcus-test.a
parser_test_stream_CPPFLAGS = $(AM_CPPFLAGS)
# parser-test-zip-archive
diff --git a/src/parser/stream.cpp b/src/parser/stream.cpp
index 0af9e7fc..889258b8 100644
--- a/src/parser/stream.cpp
+++ b/src/parser/stream.cpp
@@ -8,6 +8,8 @@
#include <orcus/stream.hpp>
#include <orcus/exception.hpp>
+#include "utf8.hpp"
+
#include <sstream>
#include <fstream>
#include <tuple>
@@ -409,6 +411,38 @@ size_t locate_first_different_char(std::string_view left, std::string_view right
return n;
}
+/**
+ * Count the UTF-8 encoded characters in a string.
+ *
+ * Only the first byte of each character is inspected, to find out how many
+ * bytes that character occupies; the bytes that follow it are skipped over
+ * without being validated.
+ *
+ * @param s UTF-8 encoded string to measure.
+ *
+ * @return number of characters in the string; 0 for an empty string.
+ *
+ * @throws std::invalid_argument when the byte length reported for a leading
+ *         byte is zero or greater than 4, or when the last character needs
+ *         more bytes than remain in the string.
+ */
+std::size_t calc_logical_string_length(std::string_view s)
+{
+    std::size_t length = 0;
+
+    const char* p = s.data();
+    const char* const p_end = p + s.size();
+
+    while (p < p_end)
+    {
+        const std::size_t n_bytes = calc_utf8_byte_length(*p);
+        if (!n_bytes || n_bytes > 4)
+        {
+            std::ostringstream os;
+            os << "'" << s << "' contains invalid character at position " << std::distance(s.data(), p);
+            throw std::invalid_argument(os.str());
+        }
+
+        // Compare against the bytes still available *before* advancing.
+        // Moving p further than one past the end of the buffer is undefined
+        // behavior, so a truncated final character must be caught here
+        // rather than by inspecting p after the loop.
+        const std::size_t n_left = static_cast<std::size_t>(p_end - p);
+        if (n_bytes > n_left)
+        {
+            std::ostringstream os;
+            os << "last character of '" << s << "' ended prematurely";
+            throw std::invalid_argument(os.str());
+        }
+
+        p += n_bytes;
+        ++length;
+    }
+
+    return length;
+}
+
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/src/parser/stream_test.cpp b/src/parser/stream_test.cpp
index 147e2d3d..3303109f 100644
--- a/src/parser/stream_test.cpp
+++ b/src/parser/stream_test.cpp
@@ -17,6 +17,8 @@ using namespace orcus;
void test_stream_create_error_output()
{
+ test::stack_printer __sp__(__func__);
+
string output = create_parse_error_output("{}", 1);
cout << output << endl;
const char* expected = "1:2: {}\n ^";
@@ -25,6 +27,8 @@ void test_stream_create_error_output()
void test_stream_locate_first_different_char()
{
+ test::stack_printer __sp__(__func__);
+
struct test_case
{
const char* left;
@@ -50,10 +54,38 @@ void test_stream_locate_first_different_char()
}
}
+/**
+ * Check calc_logical_string_length() against strings made of single-byte
+ * characters, multi-byte characters, and a mixture of both.
+ */
+void test_stream_logical_string_length()
+{
+    test::stack_printer __sp__(__func__);
+
+    // Pairs an input string with the number of characters it contains.
+    struct test_case
+    {
+        std::string_view input;
+        std::size_t expected;
+    };
+
+    constexpr test_case cases[] = {
+        { "東京", 2 },
+        { "大阪は暑い", 5 },
+        { "New York", 8 },
+        { "日本は英語で言うとJapan", 14 },
+        { "fabriqué", 8 },
+        { "garçon", 6 },
+    };
+
+    for (const test_case& tc : cases)
+    {
+        const std::size_t actual = calc_logical_string_length(tc.input);
+        // Print the result before asserting on it.
+        std::cout << "'" << tc.input << "' (length=" << actual << ")" << std::endl;
+        assert(actual == tc.expected);
+    }
+}
+
int main()
{
test_stream_create_error_output();
test_stream_locate_first_different_char();
+ test_stream_logical_string_length(); // exercises calc_logical_string_length()
return EXIT_SUCCESS;
}