diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRexNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRexNodeVisitor.java index 830b40d255..59a7c171dc 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRexNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRexNodeVisitor.java @@ -131,6 +131,9 @@ public RexNode visitLiteral(Literal node, CalcitePlanContext context) { case NULL: return rexBuilder.makeNullLiteral(typeFactory.createSqlType(SqlTypeName.NULL)); case STRING: + // Calcite's default charset is switched to UTF-8 (saffron.properties, or the system + // properties set in SQLPlugin's static initializer) so non-ASCII characters (e.g. Chinese, + // Arabic) are accepted and literal types stay compatible with column types. if (value.toString().length() == 1) { // To align Spark/PostgreSQL, Char(1) is useful, such as cast('1' to boolean) should // return true diff --git a/core/src/main/resources/saffron.properties b/core/src/main/resources/saffron.properties new file mode 100644 index 0000000000..db44a8a224 --- /dev/null +++ b/core/src/main/resources/saffron.properties @@ -0,0 +1,10 @@ +# Shift Calcite's default charset from ISO-8859-1 to UTF-8 so that: +# 1. Non-ASCII PPL string literals (Chinese, Arabic, etc.) are accepted without error. +# 2. Plan-string representations (used in unit-test assertions) are unchanged, +# because Calcite suppresses the CHARACTER SET annotation and _charset prefix +# whenever the charset matches this "default" value. +# This file is only honored when Calcite's classloader can see it. Inside the OpenSearch 3.x +# plugin it is silently ignored, so SQLPlugin sets the same two values as JVM system +# properties instead. 
+calcite.default.charset=UTF-8
+calcite.default.collation.name=UTF-8$en_US
diff --git a/core/src/test/java/org/opensearch/sql/calcite/CalciteRexNodeVisitorTest.java b/core/src/test/java/org/opensearch/sql/calcite/CalciteRexNodeVisitorTest.java
index 9be542f208..558c3b8d9d 100644
--- a/core/src/test/java/org/opensearch/sql/calcite/CalciteRexNodeVisitorTest.java
+++ b/core/src/test/java/org/opensearch/sql/calcite/CalciteRexNodeVisitorTest.java
@@ -13,9 +13,12 @@
 import java.sql.Connection;
 import java.util.List;
 import org.apache.calcite.rel.type.RelDataType;
+import org.apache.calcite.rex.RexLiteral;
 import org.apache.calcite.rex.RexNode;
 import org.apache.calcite.sql.type.ArraySqlType;
 import org.apache.calcite.sql.type.SqlTypeName;
+import org.opensearch.sql.ast.expression.DataType;
+import org.opensearch.sql.ast.expression.Literal;
 import org.apache.calcite.tools.FrameworkConfig;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
@@ -76,6 +79,42 @@ public void tearDown() {
     mockedStatic.close();
   }
 
+  @Test
+  public void testVisitLiteralNonAsciiStringDoesNotThrow() {
+    // Regression test for https://github.com/opensearch-project/OpenSearch/issues/21880:
+    // visitLiteral must build non-Latin (e.g. Chinese, Arabic) string literals through
+    // RexBuilder.makeLiteral without throwing a CalciteException.
+    // The context's rexBuilder is a real ExtendedRexBuilder backed by TYPE_FACTORY (the mock
+    // only supplies TYPE_FACTORY through getTypeFactory()), so the real Calcite NlsString /
+    // makeLiteral code path runs here.
+    CalciteRexNodeVisitor rexVisitor = new CalciteRexNodeVisitor(relNodeVisitor);
+    CalcitePlanContext planContext =
+        CalcitePlanContext.create(frameworkConfig, SysLimit.DEFAULT, QueryType.PPL);
+
+    Literal multiCharChinese = new Literal("未处置", DataType.STRING);
+    Literal multiCharArabic = new Literal("مرحبا", DataType.STRING);
+    Literal oneChar = new Literal("中", DataType.STRING);
+
+    // Multi-character literals: VARCHAR carrying the UTF-8 charset.
+    assertUtf8Literal(rexVisitor, planContext, multiCharChinese, SqlTypeName.VARCHAR);
+    assertUtf8Literal(rexVisitor, planContext, multiCharArabic, SqlTypeName.VARCHAR);
+    // Single non-ASCII character: CHAR, still carrying the UTF-8 charset.
+    assertUtf8Literal(rexVisitor, planContext, oneChar, SqlTypeName.CHAR);
+  }
+
+  /** Visits {@code literal} and asserts a non-null RexLiteral of the given type in UTF-8. */
+  private static void assertUtf8Literal(
+      CalciteRexNodeVisitor rexVisitor,
+      CalcitePlanContext planContext,
+      Literal literal,
+      SqlTypeName expectedTypeName) {
+    RexNode built = rexVisitor.visitLiteral(literal, planContext);
+    assertNotNull(built);
+    assertInstanceOf(RexLiteral.class, built);
+    assertEquals(expectedTypeName, built.getType().getSqlTypeName());
+    assertEquals(java.nio.charset.StandardCharsets.UTF_8, built.getType().getCharset());
+  }
+
   @Test
   public void testPrepareLambdaForBasicLambda() {
     when(componentType.getSqlTypeName()).thenReturn(SqlTypeName.DOUBLE);
diff --git a/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java b/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java
index 0eee03102b..0676af35eb 100644
--- a/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java
+++ b/plugin/src/main/java/org/opensearch/sql/plugin/SQLPlugin.java
@@ -151,6
+151,24 @@ public class SQLPlugin extends Plugin
 
   private static final Logger LOGGER = LogManager.getLogger(SQLPlugin.class);
 
+  static {
+    // Make UTF-8 the default Calcite charset/collation so non-ASCII PPL string literals
+    // (Chinese, Arabic, etc.) work without any user configuration.
+    // CalciteSystemProperty reads saffron.properties via its own classloader; in OpenSearch 3.x
+    // Calcite is loaded by a parent/server classloader that cannot see resources bundled in the
+    // plugin JAR, so that file is silently ignored. JVM system properties are
+    // classloader-agnostic and, when set here before any Calcite class is first used, have the
+    // same effect as -Dcalcite.default.charset=UTF-8 in jvm.options.
+    // An explicitly configured calcite.default.charset is left untouched.
+    // NOTE(review): only the charset key is checked, so a user-supplied
+    // calcite.default.collation.name is overwritten when the charset is unset - confirm.
+    final String charsetKey = "calcite.default.charset";
+    if (System.getProperty(charsetKey) == null) {
+      System.setProperty(charsetKey, "UTF-8");
+      System.setProperty("calcite.default.collation.name", "UTF-8$en_US");
+    }
+  }
+
   private List executionEngineExtensions = List.of();
 
   private ClusterService clusterService;