Skip to content

Commit 8c387dc

Browse files
authored
Merge pull request #37 from yumaoka/bug-props-escape
Fixes for Java properties space and other encoding problems
2 parents ec5e1a9 + bccb704 commit 8c387dc

8 files changed

Lines changed: 367 additions & 68 deletions

File tree

gp-res-filter/pom.xml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,6 @@
8383
<version>2.6.3</version>
8484
</dependency>
8585

86-
<!-- StringEscapeUtils -->
87-
<dependency>
88-
<groupId>org.apache.commons</groupId>
89-
<artifactId>commons-lang3</artifactId>
90-
<version>3.4</version>
91-
</dependency>
92-
9386
<!-- JUnit -->
9487
<dependency>
9588
<groupId>junit</groupId>

gp-res-filter/src/main/java/com/ibm/g11n/pipeline/resfilter/JavaPropertiesResource.java

Lines changed: 213 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@
2323
import java.io.OutputStreamWriter;
2424
import java.io.PrintWriter;
2525
import java.text.BreakIterator;
26-
import java.text.CharacterIterator;
27-
import java.text.StringCharacterIterator;
2826
import java.util.ArrayList;
2927
import java.util.Collections;
3028
import java.util.Enumeration;
@@ -40,8 +38,6 @@
4038

4139
import com.ibm.g11n.pipeline.resfilter.ResourceString.ResourceStringComparator;
4240

43-
import org.apache.commons.lang3.StringEscapeUtils;
44-
4541
/**
4642
* Java properties resource filter implementation.
4743
*
@@ -88,14 +84,14 @@ public Bundle parse(InputStream inStream) throws IOException {
8884
boolean globalNotesAvailable = true;
8985
List<String> globalNotes = null;
9086
while ((line = inStreamReader.readLine()) != null) {
91-
line = line.trim();
87+
line = stripLeadingSpaces(line);
9288
// Comment line - Add to list of comments (notes) until we find
9389
// either
9490
// a blank line (global comment) or a key/value pair
9591
if (line.startsWith("#") || line.startsWith("!")) {
9692
// Strip off the leading comment marker, and perform any
9793
// necessary unescaping here.
98-
currentNotes.add(StringEscapeUtils.unescapeJava(line.substring(1)));
94+
currentNotes.add(unescape(line.substring(1)));
9995
} else if (line.isEmpty()) {
10096
// We are following the convention that the first blank line in
10197
// a properties
@@ -118,7 +114,7 @@ public Bundle parse(InputStream inStream) throws IOException {
118114
sb.setLength(sb.length() - 1); // Remove the continuation
119115
// "\"
120116
if (continuationLine != null) {
121-
sb.append(continuationLine.trim());
117+
sb.append(stripLeadingSpaces(continuationLine));
122118
}
123119
}
124120
String logicalLine = sb.toString();
@@ -241,7 +237,7 @@ public static PropDef parseLine(String line) {
241237
}
242238

243239
String key = unescapePropKey(line.substring(0, sepIdx).trim());
244-
String value = unescapePropValue(line.substring(sepIdx + 1).trim());
240+
String value = unescapePropValue(stripLeadingSpaces(line.substring(sepIdx + 1)));
245241

246242
PropDef pl = new PropDef(key, value, sep);
247243
return pl;
@@ -300,14 +296,21 @@ public void print(PrintWriter pw, String language) throws IOException {
300296
int start = 0;
301297
int end = brk.next();
302298
boolean emitNext = false;
299+
boolean firstSegment = true;
303300
while (end != BreakIterator.DONE) {
304301
String segment = value.substring(start, end);
305-
String escSegment = escapePropValue(segment);
302+
String escSegment = null;
303+
if (firstSegment) {
304+
escSegment = escape(segment, EscapeSpace.LEADING_ONLY);
305+
firstSegment = false;
306+
} else {
307+
escSegment = escape(segment, EscapeSpace.NONE);
308+
}
306309
if (emitNext || (buf.length() + escSegment.length() + 2 >= COLMAX)) {
307310
// First character in a continuation line must be
308311
// a non-space character. Otherwise, keep appending
309312
// segments to the current line.
310-
if (!Character.isSpaceChar(escSegment.codePointAt(0))) {
313+
if (!isPropsWhiteSpaceChar(escSegment.charAt(0))) {
311314
// This segment is safe as the first word
312315
// of a continuation line.
313316
buf.append('\\');
@@ -356,56 +359,189 @@ public String toString() {
356359
}
357360
}
358361

359-
private static String escapePropValue(String s) {
360-
StringBuilder escaped = new StringBuilder();
361-
StringCharacterIterator itr = new StringCharacterIterator(s);
362-
for (char c = itr.first(); c != CharacterIterator.DONE; c = itr.next()) {
363-
if (c == '\\') {
364-
escaped.append("\\\\");
365-
} else if (c > 0x7F) {
366-
escaped.append("\\u").append(String.format("%04X", (int) c));
367-
} else if (c == ':') {
368-
escaped.append("\\:");
369-
} else if (c == '=') {
370-
escaped.append("\\:");
362+
private static final char BACKSLASH = '\\';
363+
364+
private enum EscapeSpace {
365+
ALL,
366+
LEADING_ONLY,
367+
NONE;
368+
}
369+
370+
private static String escape(String str, EscapeSpace escSpace) {
371+
StringBuilder buf = new StringBuilder();
372+
int idx = 0;
373+
374+
// Handle leading space characters
375+
if (escSpace == EscapeSpace.ALL || escSpace == EscapeSpace.LEADING_ONLY) {
376+
// Java properties specification considers the characters space (' ', '\u0020'),
377+
// tab ('\t', '\u0009'), and form feed ('\f', '\u000C') to be white space.
378+
//
379+
// java.util.Properties#store() implementation escapes space characters
380+
// to "\ " in key string, as well as leading spaces in value string.
381+
// Other white space characters are encoded by Unicode escape sequence.
382+
for (; idx < str.length(); idx++) {
383+
char c = str.charAt(idx);
384+
if (c == ' ') {
385+
buf.append(BACKSLASH).append(' ');
386+
} else if (c == '\t') {
387+
buf.append(BACKSLASH).append('t');
388+
} else if (c == '\f') {
389+
buf.append(BACKSLASH).append('f');
390+
} else {
391+
break;
392+
}
393+
}
394+
}
395+
396+
for (int i = idx; i < str.length(); i++) {
397+
char c = str.charAt(i);
398+
399+
if (c < 0x20 || c >= 0x7E) {
400+
// JDK API comment for Properties#store() specifies below:
401+
//
402+
// Characters less than \\u0020 and characters greater than \u007E in property keys
403+
// or values are written as \\uxxxx for the appropriate hexadecimal value xxxx.
404+
//
405+
// However, actual implementation uses "\t" for horizontal tab, "\n" for newline
406+
// and so on. This implementation support the equivalent behavior.
407+
switch (c) {
408+
case '\t':
409+
buf.append(BACKSLASH).append('t');
410+
break;
411+
case '\n':
412+
buf.append(BACKSLASH).append('n');
413+
break;
414+
case '\f':
415+
buf.append(BACKSLASH).append('f');
416+
break;
417+
case '\r':
418+
buf.append(BACKSLASH).append('r');
419+
break;
420+
default:
421+
appendUnicodeEscape(buf, c);
422+
break;
423+
}
371424
} else {
372-
escaped.append(c);
425+
switch (c) {
426+
case ' ': // space
427+
if (escSpace == EscapeSpace.ALL) {
428+
buf.append(BACKSLASH).append(c);
429+
} else {
430+
buf.append(c);
431+
}
432+
break;
433+
434+
// The key and element characters #, !, =, and : are written with
435+
// a preceding backslash
436+
case '#':
437+
case '!':
438+
case '=':
439+
case ':':
440+
case '\\':
441+
buf.append(BACKSLASH).append(c);
442+
break;
443+
444+
default:
445+
buf.append(c);
446+
break;
447+
}
373448
}
374449
}
375-
return escaped.toString();
450+
451+
return buf.toString();
452+
}
453+
454+
static String escapePropKey(String str) {
455+
return escape(str, EscapeSpace.ALL);
456+
}
457+
458+
static String escapePropValue(String str) {
459+
return escape(str, EscapeSpace.LEADING_ONLY);
460+
}
461+
462+
static void appendUnicodeEscape(StringBuilder buf, char codeUnit) {
463+
buf.append(BACKSLASH).append('u')
464+
.append(String.format("%04X", (int)codeUnit));
376465
}
377466

378-
private static String unescapePropValue(String s) {
379-
StringBuilder unescaped = new StringBuilder();
380-
StringCharacterIterator itr = new StringCharacterIterator(s);
381-
for (char c = itr.first(); c != CharacterIterator.DONE; c = itr.next()) {
382-
if (c == '\\' && itr.getIndex() < itr.getEndIndex()) {
383-
char n = itr.next();
384-
if (n == '\\' || n == ':' || n == '=') {
385-
unescaped.append(n);
386-
} else if (n == 'u' && itr.getIndex() + 4 <= itr.getEndIndex()) {
387-
StringBuilder unicodeEscape = new StringBuilder("\\u");
388-
for (int i = 0; i < 4; i++) {
389-
unicodeEscape.append(itr.next());
467+
static String unescapePropKey(String str) {
468+
return unescape(str);
469+
}
470+
471+
static String unescapePropValue(String str) {
472+
return unescape(str);
473+
}
474+
475+
private static String unescape(String str) {
476+
StringBuilder buf = new StringBuilder();
477+
boolean isEscSeq = false;
478+
for (int i = 0; i < str.length(); i++) {
479+
char c = str.charAt(i);
480+
if (isEscSeq) {
481+
switch (c) {
482+
case 't':
483+
buf.append('\t');
484+
break;
485+
486+
case 'n':
487+
buf.append('\n');
488+
break;
489+
490+
case 'f':
491+
buf.append('\f');
492+
break;
493+
494+
case 'r':
495+
buf.append('\r');
496+
break;
497+
498+
case 'u':
499+
{
500+
// This implementation throws an IllegalArgumentException
501+
// when the input string contains a malformed Unicode escape
502+
// character sequence. This behavior matches java.util.Properties#load(Reader).
503+
final String errMsg = "Malformed \\uxxxx encoding.";
504+
if (i + 4 > str.length()) {
505+
throw new IllegalArgumentException(errMsg);
390506
}
391-
unescaped.append(StringEscapeUtils.unescapeJava(unicodeEscape.toString()));
392-
} else {
393-
unescaped.append(c);
394-
unescaped.append(n);
507+
// Parse hex digits
508+
String hexDigits = str.substring(i + 1, i + 5);
509+
try {
510+
char codeUnit = (char)Integer.parseInt(hexDigits, 16);
511+
buf.append(Character.valueOf(codeUnit));
512+
i += 4;
513+
} catch (NumberFormatException e) {
514+
throw new IllegalArgumentException(errMsg, e);
515+
}
516+
break;
395517
}
518+
519+
default:
520+
// Special rules applied to Java properties format
521+
// beyond standard Java escape character sequence.
522+
//
523+
// 1. Octal escapes are not recognized
524+
// 2. \b does not represent a backspace character
525+
// 3. Backslash is dropped from unrecognized escape sequence.
526+
// For example, "\z" is interpreted as a single character 'z'.
527+
528+
buf.append(c);
529+
break;
530+
}
531+
isEscSeq = false;
396532
} else {
397-
unescaped.append(c);
533+
if (c == BACKSLASH) {
534+
isEscSeq = true;
535+
} else {
536+
buf.append(c);
537+
}
398538
}
399539
}
400-
return unescaped.toString();
401-
}
402540

403-
private static String escapePropKey(String s) {
404-
return s.replace(" ", "\\ ");
405-
}
541+
// Note: Incomplete escape sequence should not be there.
542+
// This implementation silently drop the character for the case.
406543

407-
private static String unescapePropKey(String s) {
408-
return s.replaceAll("\\\\ ", " ");
544+
return buf.toString();
409545
}
410546

411547
@Override
@@ -435,7 +571,7 @@ public void merge(InputStream base, OutputStream outStream, String language, Bun
435571
logicalLine = logicalLineBuf.toString();
436572
}
437573
} else {
438-
String normLine = line.trim();
574+
String normLine = stripLeadingSpaces(line);
439575

440576
if (orgLines.isEmpty()) {
441577
// No continuation marker in the previous line
@@ -467,6 +603,13 @@ public void merge(InputStream base, OutputStream outStream, String language, Bun
467603
if (logicalLine != null) {
468604
PropDef pd = PropDef.parseLine(logicalLine);
469605
if (pd != null && resMap.containsKey(pd.getKey())) {
606+
// Preserve original leading spaces
607+
String firstLine = orgLines.isEmpty() ? line : orgLines.get(0);
608+
int len = getLeadingSpacesLength(firstLine);
609+
if (len > 0) {
610+
outWriter.print(firstLine.substring(0, len));
611+
}
612+
// Write the property key and value
470613
String key = pd.getKey();
471614
PropDef modPd = new PropDef(key, resMap.get(key), pd.getSeparator());
472615
modPd.print(outWriter, language);
@@ -490,4 +633,25 @@ public void merge(InputStream base, OutputStream outStream, String language, Bun
490633

491634
outWriter.flush();
492635
}
636+
637+
private static int getLeadingSpacesLength(String s) {
638+
int idx = 0;
639+
for (; idx < s.length(); idx++) {
640+
if (!isPropsWhiteSpaceChar(s.charAt(idx))) {
641+
break;
642+
}
643+
}
644+
return idx;
645+
}
646+
647+
private static String stripLeadingSpaces(String s) {
648+
return s.substring(getLeadingSpacesLength(s));
649+
}
650+
651+
private static boolean isPropsWhiteSpaceChar(char c) {
652+
// Java properties specification considers the characters space (' ', '\u0020'),
653+
// tab ('\t', '\u0009'), and form feed ('\f', '\u000C') to be white space.
654+
655+
return c == ' ' || c == '\t' || c == '\f';
656+
}
493657
}

0 commit comments

Comments
 (0)