Skip to content

Commit 2e6b6b6

Browse files
author
Yoshito Umaoka
committed
Support leading/trailing spaces and java character escape sequence properly in properties file
Fixed #35: several problems with handling leading/trailing spaces in properties keys/values. - Trailing spaces in a property line are now properly interpreted as part of the value. - Leading spaces in a property value are now escaped properly. - The merge method implementation now preserves the leading whitespace characters of each line. Fixed #36: Java character escape sequence issues. - The JDK Properties API specification for escaping/unescaping property keys/values is now supported. Because the Properties class's escape/unescape rules differ from the standard Java character escapes, the use of Apache Commons StringEscapeUtils#escapeJava()/#unescapeJava() was replaced with our own implementation.
1 parent 0f82383 commit 2e6b6b6

8 files changed

Lines changed: 272 additions & 68 deletions

File tree

gp-res-filter/pom.xml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,6 @@
8383
<version>2.6.3</version>
8484
</dependency>
8585

86-
<!-- StringEscapeUtils -->
87-
<dependency>
88-
<groupId>org.apache.commons</groupId>
89-
<artifactId>commons-lang3</artifactId>
90-
<version>3.4</version>
91-
</dependency>
92-
9386
<!-- JUnit -->
9487
<dependency>
9588
<groupId>junit</groupId>

gp-res-filter/src/main/java/com/ibm/g11n/pipeline/resfilter/JavaPropertiesResource.java

Lines changed: 211 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@
2323
import java.io.OutputStreamWriter;
2424
import java.io.PrintWriter;
2525
import java.text.BreakIterator;
26-
import java.text.CharacterIterator;
27-
import java.text.StringCharacterIterator;
2826
import java.util.ArrayList;
2927
import java.util.Collections;
3028
import java.util.Enumeration;
@@ -40,8 +38,6 @@
4038

4139
import com.ibm.g11n.pipeline.resfilter.ResourceString.ResourceStringComparator;
4240

43-
import org.apache.commons.lang3.StringEscapeUtils;
44-
4541
/**
4642
* Java properties resource filter implementation.
4743
*
@@ -88,14 +84,14 @@ public Bundle parse(InputStream inStream) throws IOException {
8884
boolean globalNotesAvailable = true;
8985
List<String> globalNotes = null;
9086
while ((line = inStreamReader.readLine()) != null) {
91-
line = line.trim();
87+
line = stripLeadingSpaces(line);
9288
// Comment line - Add to list of comments (notes) until we find
9389
// either
9490
// a blank line (global comment) or a key/value pair
9591
if (line.startsWith("#") || line.startsWith("!")) {
9692
// Strip off the leading comment marker, and perform any
9793
// necessary unescaping here.
98-
currentNotes.add(StringEscapeUtils.unescapeJava(line.substring(1)));
94+
currentNotes.add(unescape(line.substring(1)));
9995
} else if (line.isEmpty()) {
10096
// We are following the convention that the first blank line in
10197
// a properties
@@ -118,7 +114,7 @@ public Bundle parse(InputStream inStream) throws IOException {
118114
sb.setLength(sb.length() - 1); // Remove the continuation
119115
// "\"
120116
if (continuationLine != null) {
121-
sb.append(continuationLine.trim());
117+
sb.append(stripLeadingSpaces(continuationLine));
122118
}
123119
}
124120
String logicalLine = sb.toString();
@@ -241,7 +237,7 @@ public static PropDef parseLine(String line) {
241237
}
242238

243239
String key = unescapePropKey(line.substring(0, sepIdx).trim());
244-
String value = unescapePropValue(line.substring(sepIdx + 1).trim());
240+
String value = unescapePropValue(stripLeadingSpaces(line.substring(sepIdx + 1)));
245241

246242
PropDef pl = new PropDef(key, value, sep);
247243
return pl;
@@ -300,14 +296,21 @@ public void print(PrintWriter pw, String language) throws IOException {
300296
int start = 0;
301297
int end = brk.next();
302298
boolean emitNext = false;
299+
boolean firstSegment = true;
303300
while (end != BreakIterator.DONE) {
304301
String segment = value.substring(start, end);
305-
String escSegment = escapePropValue(segment);
302+
String escSegment = null;
303+
if (firstSegment) {
304+
escSegment = escape(segment, EscapeSpace.LEADING_ONLY);
305+
firstSegment = false;
306+
} else {
307+
escSegment = escape(segment, EscapeSpace.NONE);
308+
}
306309
if (emitNext || (buf.length() + escSegment.length() + 2 >= COLMAX)) {
307310
// First character in a continuation line must be
308311
// a non-space character. Otherwise, keep appending
309312
// segments to the current line.
310-
if (!Character.isSpaceChar(escSegment.codePointAt(0))) {
313+
if (!isPropsWhiteSpaceChar(escSegment.charAt(0))) {
311314
// This segment is safe as the first word
312315
// of a continuation line.
313316
buf.append('\\');
@@ -356,56 +359,187 @@ public String toString() {
356359
}
357360
}
358361

359-
private static String escapePropValue(String s) {
360-
StringBuilder escaped = new StringBuilder();
361-
StringCharacterIterator itr = new StringCharacterIterator(s);
362-
for (char c = itr.first(); c != CharacterIterator.DONE; c = itr.next()) {
363-
if (c == '\\') {
364-
escaped.append("\\\\");
365-
} else if (c > 0x7F) {
366-
escaped.append("\\u").append(String.format("%04X", (int) c));
367-
} else if (c == ':') {
368-
escaped.append("\\:");
369-
} else if (c == '=') {
370-
escaped.append("\\:");
362+
private static final char BACKSLASH = '\\';
363+
364+
private enum EscapeSpace {
365+
ALL,
366+
LEADING_ONLY,
367+
NONE;
368+
}
369+
370+
private static String escape(String str, EscapeSpace escSpace) {
371+
StringBuilder buf = new StringBuilder();
372+
int idx = 0;
373+
374+
// Handle leading space characters
375+
if (escSpace == EscapeSpace.ALL || escSpace == EscapeSpace.LEADING_ONLY) {
376+
// Java properties specification considers the characters space (' ', '\u0020'),
377+
// tab ('\t', '\u0009'), and form feed ('\f', '\u000C') to be white space.
378+
//
379+
// java.util.Properties#store() implementation escapes space characters
380+
// to "\ " in key string, as well as leading spaces in value string.
381+
// Other white space characters are encoded by Unicode escape sequence.
382+
for (; idx < str.length(); idx++) {
383+
char c = str.charAt(idx);
384+
if (c == ' ') {
385+
buf.append(BACKSLASH).append(' ');
386+
} else if (c == '\t' || c == '\f') {
387+
appendUnicodeEscape(buf, c);
388+
} else {
389+
break;
390+
}
391+
}
392+
}
393+
394+
for (int i = idx; i < str.length(); i++) {
395+
char c = str.charAt(i);
396+
397+
if (c < 0x20 || c >= 0x7E) {
398+
// JDK API comment for Properties#store() specifies below:
399+
//
400+
// Characters less than \\u0020 and characters greater than \u007E in property keys
401+
// or values are written as \\uxxxx for the appropriate hexadecimal value xxxx.
402+
//
403+
// However, actual implementation uses "\t" for horizontal tab, "\n" for newline
404+
// and so on. This implementation support the equivalent behavior.
405+
switch (c) {
406+
case '\t':
407+
buf.append(BACKSLASH).append('t');
408+
break;
409+
case '\n':
410+
buf.append(BACKSLASH).append('n');
411+
break;
412+
case '\f':
413+
buf.append(BACKSLASH).append('f');
414+
break;
415+
case '\r':
416+
buf.append(BACKSLASH).append('r');
417+
break;
418+
default:
419+
appendUnicodeEscape(buf, c);
420+
break;
421+
}
371422
} else {
372-
escaped.append(c);
423+
switch (c) {
424+
case ' ': // space
425+
if (escSpace == EscapeSpace.ALL) {
426+
buf.append(BACKSLASH).append(c);
427+
} else {
428+
buf.append(c);
429+
}
430+
break;
431+
432+
// The key and element characters #, !, =, and : are written with
433+
// a preceding backslash
434+
case '#':
435+
case '!':
436+
case '=':
437+
case ':':
438+
case '\\':
439+
buf.append(BACKSLASH).append(c);
440+
break;
441+
442+
default:
443+
buf.append(c);
444+
break;
445+
}
373446
}
374447
}
375-
return escaped.toString();
448+
449+
return buf.toString();
450+
}
451+
452+
static String escapePropKey(String str) {
453+
return escape(str, EscapeSpace.ALL);
454+
}
455+
456+
static String escapePropValue(String str) {
457+
return escape(str, EscapeSpace.LEADING_ONLY);
458+
}
459+
460+
static void appendUnicodeEscape(StringBuilder buf, char codeUnit) {
461+
buf.append(BACKSLASH).append('u')
462+
.append(String.format("%04X", (int)codeUnit));
376463
}
377464

378-
private static String unescapePropValue(String s) {
379-
StringBuilder unescaped = new StringBuilder();
380-
StringCharacterIterator itr = new StringCharacterIterator(s);
381-
for (char c = itr.first(); c != CharacterIterator.DONE; c = itr.next()) {
382-
if (c == '\\' && itr.getIndex() < itr.getEndIndex()) {
383-
char n = itr.next();
384-
if (n == '\\' || n == ':' || n == '=') {
385-
unescaped.append(n);
386-
} else if (n == 'u' && itr.getIndex() + 4 <= itr.getEndIndex()) {
387-
StringBuilder unicodeEscape = new StringBuilder("\\u");
388-
for (int i = 0; i < 4; i++) {
389-
unicodeEscape.append(itr.next());
465+
static String unescapePropKey(String str) {
466+
return unescape(str);
467+
}
468+
469+
static String unescapePropValue(String str) {
470+
return unescape(str);
471+
}
472+
473+
private static String unescape(String str) {
474+
StringBuilder buf = new StringBuilder();
475+
boolean isEscSeq = false;
476+
for (int i = 0; i < str.length(); i++) {
477+
char c = str.charAt(i);
478+
if (isEscSeq) {
479+
switch (c) {
480+
case 't':
481+
buf.append('\t');
482+
break;
483+
484+
case 'n':
485+
buf.append('\n');
486+
break;
487+
488+
case 'f':
489+
buf.append('\f');
490+
break;
491+
492+
case 'r':
493+
buf.append('\r');
494+
break;
495+
496+
case 'u':
497+
{
498+
// This implementation throws an IllegalArgumentException
499+
// when the input string contains a malformed Unicode escape
500+
// character sequence. This behavior matches java.util.Properties#load(Reader).
501+
final String errMsg = "Malformed \\uxxxx encoding.";
502+
if (i + 4 > str.length()) {
503+
throw new IllegalArgumentException(errMsg);
390504
}
391-
unescaped.append(StringEscapeUtils.unescapeJava(unicodeEscape.toString()));
392-
} else {
393-
unescaped.append(c);
394-
unescaped.append(n);
505+
// Parse hex digits
506+
String hexDigits = str.substring(i + 1, i + 5);
507+
try {
508+
char codeUnit = (char)Integer.parseInt(hexDigits, 16);
509+
buf.append(Character.valueOf(codeUnit));
510+
i += 4;
511+
} catch (NumberFormatException e) {
512+
throw new IllegalArgumentException(errMsg, e);
513+
}
514+
break;
395515
}
516+
517+
default:
518+
// Special rules applied to Java properties format
519+
// beyond standard Java escape character sequence.
520+
//
521+
// 1. Octal escapes are not recognized
522+
// 2. \b does not represent a backspace character
523+
// 3. Backslash is dropped from unrecognized escape sequence.
524+
// For example, "\z" is interpreted as a single character 'z'.
525+
526+
buf.append(c);
527+
break;
528+
}
529+
isEscSeq = false;
396530
} else {
397-
unescaped.append(c);
531+
if (c == BACKSLASH) {
532+
isEscSeq = true;
533+
} else {
534+
buf.append(c);
535+
}
398536
}
399537
}
400-
return unescaped.toString();
401-
}
402538

403-
private static String escapePropKey(String s) {
404-
return s.replace(" ", "\\ ");
405-
}
539+
// Note: Incomplete escape sequence should not be there.
540+
// This implementation silently drop the character for the case.
406541

407-
private static String unescapePropKey(String s) {
408-
return s.replaceAll("\\\\ ", " ");
542+
return buf.toString();
409543
}
410544

411545
@Override
@@ -435,7 +569,7 @@ public void merge(InputStream base, OutputStream outStream, String language, Bun
435569
logicalLine = logicalLineBuf.toString();
436570
}
437571
} else {
438-
String normLine = line.trim();
572+
String normLine = stripLeadingSpaces(line);
439573

440574
if (orgLines.isEmpty()) {
441575
// No continuation marker in the previous line
@@ -467,6 +601,13 @@ public void merge(InputStream base, OutputStream outStream, String language, Bun
467601
if (logicalLine != null) {
468602
PropDef pd = PropDef.parseLine(logicalLine);
469603
if (pd != null && resMap.containsKey(pd.getKey())) {
604+
// Preserve original leading spaces
605+
String firstLine = orgLines.isEmpty() ? line : orgLines.get(0);
606+
int len = getLeadingSpacesLength(firstLine);
607+
if (len > 0) {
608+
outWriter.print(firstLine.substring(0, len));
609+
}
610+
// Write the property key and value
470611
String key = pd.getKey();
471612
PropDef modPd = new PropDef(key, resMap.get(key), pd.getSeparator());
472613
modPd.print(outWriter, language);
@@ -490,4 +631,25 @@ public void merge(InputStream base, OutputStream outStream, String language, Bun
490631

491632
outWriter.flush();
492633
}
634+
635+
private static int getLeadingSpacesLength(String s) {
636+
int idx = 0;
637+
for (; idx < s.length(); idx++) {
638+
if (!isPropsWhiteSpaceChar(s.charAt(idx))) {
639+
break;
640+
}
641+
}
642+
return idx;
643+
}
644+
645+
private static String stripLeadingSpaces(String s) {
646+
return s.substring(getLeadingSpacesLength(s));
647+
}
648+
649+
private static boolean isPropsWhiteSpaceChar(char c) {
650+
// Java properties specification considers the characters space (' ', '\u0020'),
651+
// tab ('\t', '\u0009'), and form feed ('\f', '\u000C') to be white space.
652+
653+
return c == ' ' || c == '\t' || c == '\f';
654+
}
493655
}

0 commit comments

Comments
 (0)