2 files changed, 85 insertions, 0 deletions
diff --git a/compilerplugins/clang/stringliteralvar.cxx b/compilerplugins/clang/stringliteralvar.cxx
index 5ace384f1e16..fcd3690669e7 100644
--- a/compilerplugins/clang/stringliteralvar.cxx
+++ b/compilerplugins/clang/stringliteralvar.cxx
@@ -28,6 +28,7 @@
 #include <cassert>
 
 #include "check.hxx"
+#include "compat.hxx"
 #include "plugin.hxx"
 
 namespace
@@ -137,6 +138,10 @@ public:
                     return true;
                 }
                 auto const d = e1->getDecl();
+                if (isPotentiallyInitializedWithMalformedUtf16(d))
+                {
+                    return true;
+                }
                 if (!reportedArray_.insert(d).second)
                 {
                     return true;
@@ -188,6 +193,10 @@ public:
             return true;
         }
         auto const d = e->getDecl();
+        if (isPotentiallyInitializedWithMalformedUtf16(d))
+        {
+            return true;
+        }
         if (!reportedArray_.insert(d).second)
         {
             return true;
@@ -246,6 +255,61 @@ private:
         }
     }
 
+    // There is some confusion on the semantics of numeric-escape-sequences in string literals, see
+    // <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2029r4.html> "Proposed resolution
+    // for core issues 411, 1656, and 2333; numeric and universal character escapes in character and
+    // string literals", so suppress warnings about arrays that are deliberately not written as
+    // UTF-16 string literals because they contain lone surrogates (return true for those):
+    bool isPotentiallyInitializedWithMalformedUtf16(ValueDecl const* decl) const
+    {
+        if (!decl->getType()->getArrayElementTypeNoTypeQual()->isChar16Type())
+        {
+            return false; // element type is not char16_t: nothing to suppress
+        }
+        auto const init = cast<VarDecl>(decl)->getAnyInitializer(); // assumes decl is a VarDecl
+        if (init == nullptr)
+        {
+            return true; // no initializer in view: cannot rule out malformed content
+        }
+        auto const list = dyn_cast<InitListExpr>(init);
+        if (list == nullptr)
+        {
+            // Assuming that the initializer already is a string literal, assume that that string
+            // literal has no issues with malformed UTF-16:
+            if (isDebugMode())
+            {
+                assert(isa<clang::StringLiteral>(init));
+            }
+            return false;
+        }
+        auto highSurrogate = false; // true while a high surrogate awaits its low surrogate
+        for (auto const e : list->inits())
+        {
+            llvm::APSInt v;
+            if (!compat::EvaluateAsInt(e, v, compiler.getASTContext()))
+            {
+                return true; // could not evaluate e as an integer: be conservative
+            }
+            if (highSurrogate)
+            {
+                if (v < 0xDC00 || v > 0xDFFF) // not a low surrogate
+                {
+                    return true; // high surrogate not followed by a low surrogate
+                }
+                highSurrogate = false;
+            }
+            else if (v >= 0xD800 && v <= 0xDBFF) // high surrogate
+            {
+                highSurrogate = true;
+            }
+            else if (v >= 0xDC00 && v <= 0xDFFF) // low surrogate
+            {
+                return true; // low surrogate without a preceding high surrogate
+            }
+        }
+        return highSurrogate; // malformed iff the list ends in an unpaired high surrogate
+    }
+
     std::set<Decl const*> reportedAutomatic_;
     std::set<Decl const*> reportedArray_;
 };
diff --git a/compilerplugins/clang/test/stringliteralvar.cxx b/compilerplugins/clang/test/stringliteralvar.cxx
index 3c0eaaccae04..d0fdcedb0668 100644
--- a/compilerplugins/clang/test/stringliteralvar.cxx
+++ b/compilerplugins/clang/test/stringliteralvar.cxx
@@ -110,4 +110,25 @@ void f11(int nStreamType)
     (void)sStreamType;
 }
 
+extern sal_Unicode const extarr[1]; // declared here without an initializer
+
+sal_Unicode init(); // declaration only; used in f12 as an array element initializer
+
+void f12()
+{
+    // Suppress warnings if the array contains a malformed sequence of UTF-16 code units...:
+    static sal_Unicode const arr1[] = { 0xD800 }; // lone high surrogate
+    f(OUString(arr1, 1));
+    // ...Or potentially contains a malformed sequence of UTF-16 code units...:
+    f(OUString(extarr, 1)); // extarr is declared extern without an initializer
+    sal_Unicode const arr2[] = { init() }; // element comes from a function call
+    f(OUString(arr2, 1));
+    // ...But generate a warning if the array contains a well-formed sequence of UTF-16 code units
+    // containing surrogates:
+    // expected-error-re@+1 {{change type of variable 'arr3' from constant character array ('const sal_Unicode{{ ?}}[2]'{{( \(aka 'const char16_t\[2\]'\))?}}) to OUStringLiteral [loplugin:stringliteralvar]}}
+    static sal_Unicode const arr3[] = { 0xD800, 0xDC00 }; // high then low surrogate
+    // expected-note-re@+1 {{first passed into a '{{(rtl::)?}}OUString' constructor here [loplugin:stringliteralvar]}}
+    f(OUString(arr3, 2));
+}
+
 /* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */