Commit 9a64d3bf authored by Phillip Webb

Fix AsciiBytes unicode decoding

Fix the decoding logic in the AsciiBytes `hashCode` and `matches` to
correctly deal with multi-byte encodings.

Fixes gh-12504
parent 98a2a91d
...@@ -29,7 +29,9 @@ final class AsciiBytes { ...@@ -29,7 +29,9 @@ final class AsciiBytes {
private static final String EMPTY_STRING = ""; private static final String EMPTY_STRING = "";
private static final int[] EXCESS = { 0x0, 0x1080, 0x96, 0x1c82080 }; private static final int[] INITIAL_BYTE_BITMASK = { 0x7F, 0x1F, 0x0F, 0x07 };
private static final int SUBSEQUENT_BYTE_BITMASK = 0x3F;
private final byte[] bytes; private final byte[] bytes;
...@@ -142,13 +144,10 @@ final class AsciiBytes { ...@@ -142,13 +144,10 @@ final class AsciiBytes {
int totalLen = (nameLen + (suffix == 0 ? 0 : 1)); int totalLen = (nameLen + (suffix == 0 ? 0 : 1));
for (int i = this.offset; i < this.offset + this.length; i++) { for (int i = this.offset; i < this.offset + this.length; i++) {
int b = this.bytes[i]; int b = this.bytes[i];
if (b < 0) { int remainingUtfBytes = getNumberOfUtfBytes(b) - 1;
b = b & 0x7F; b &= INITIAL_BYTE_BITMASK[remainingUtfBytes];
int limit = getRemainingUtfBytes(b); for (int j = 0; j < remainingUtfBytes; j++) {
for (int j = 0; j < limit; j++) { b = (b << 6) + (this.bytes[++i] & SUBSEQUENT_BYTE_BITMASK);
b = (b << 6) + (this.bytes[++i] & 0xFF);
}
b -= EXCESS[limit];
} }
char c = getChar(name, suffix, charIndex++); char c = getChar(name, suffix, charIndex++);
if (b <= 0xFFFF) { if (b <= 0xFFFF) {
...@@ -185,13 +184,10 @@ final class AsciiBytes { ...@@ -185,13 +184,10 @@ final class AsciiBytes {
if (hash == 0 && this.bytes.length > 0) { if (hash == 0 && this.bytes.length > 0) {
for (int i = this.offset; i < this.offset + this.length; i++) { for (int i = this.offset; i < this.offset + this.length; i++) {
int b = this.bytes[i]; int b = this.bytes[i];
if (b < 0) { int remainingUtfBytes = getNumberOfUtfBytes(b) - 1;
b = b & 0x7F; b &= INITIAL_BYTE_BITMASK[remainingUtfBytes];
int limit = getRemainingUtfBytes(b); for (int j = 0; j < remainingUtfBytes; j++) {
for (int j = 0; j < limit; j++) { b = (b << 6) + (this.bytes[++i] & SUBSEQUENT_BYTE_BITMASK);
b = (b << 6) + (this.bytes[++i] & 0xFF);
}
b -= EXCESS[limit];
} }
if (b <= 0xFFFF) { if (b <= 0xFFFF) {
hash = 31 * hash + b; hash = 31 * hash + b;
...@@ -206,8 +202,16 @@ final class AsciiBytes { ...@@ -206,8 +202,16 @@ final class AsciiBytes {
return hash; return hash;
} }
private int getRemainingUtfBytes(int b) { private int getNumberOfUtfBytes(int b) {
return (b < 96 ? 1 : (b < 112 ? 2 : 3)); if ((b & 0x80) == 0) {
return 1;
}
int numberOfUtfBytes = 0;
while ((b & 0x80) != 0) {
b <<= 1;
numberOfUtfBytes++;
}
return numberOfUtfBytes;
} }
@Override @Override
......
/* /*
* Copyright 2012-2017 the original author or authors. * Copyright 2012-2018 the original author or authors.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -184,6 +184,18 @@ public class AsciiBytesTests { ...@@ -184,6 +184,18 @@ public class AsciiBytesTests {
matchesSameAsString("\ud83d\udca9"); matchesSameAsString("\ud83d\udca9");
} }
// Checks that the hash code of an AsciiBytes instance built from a String
// equals the value returned by the static AsciiBytes.hashCode for that same
// String. The name deliberately contains non-ASCII characters ("宋体").
@Test
public void hashCodeFromInstanceMatchesHashCodeFromString() {
String name = "fonts/宋体/simsun.ttf";
assertThat(new AsciiBytes(name).hashCode()).isEqualTo(AsciiBytes.hashCode(name));
}
// Checks that an AsciiBytes instance built from a String reports a match
// against that same String when NO_SUFFIX is passed as the suffix argument.
// The name deliberately contains non-ASCII characters ("宋体").
@Test
public void instanceCreatedFromCharSequenceMatchesSameCharSequence() {
String name = "fonts/宋体/simsun.ttf";
assertThat(new AsciiBytes(name).matches(name, NO_SUFFIX)).isTrue();
}
private void matchesSameAsString(String input) { private void matchesSameAsString(String input) {
assertThat(new AsciiBytes(input).matches(input, NO_SUFFIX)).isTrue(); assertThat(new AsciiBytes(input).matches(input, NO_SUFFIX)).isTrue();
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment