Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: fix issues with accents and the word splitter #1330

Merged
merged 3 commits into from
Jun 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion integration-tests/config/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@
{
"path": "django/django",
"url": "https://github.com/django/django.git",
"commit": "ecf8af79355c8daa67722bd0de946b351f7f613d",
"commit": "8b4983cfd429e17c8092f4ff775915327effa6fa",
"args": [
"**/*.{md,py}"
]
Expand Down
97 changes: 55 additions & 42 deletions integration-tests/snapshots/django/django/snapshot.txt

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions packages/cspell-lib/src/util/text.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,21 @@ describe('Util Text', () => {
).toEqual(['Γ', 'γ', 'gamma', 'γάμμα']);
});

test.each`
text | expected
${'hello'} | ${['hello']}
${nfc('café')} | ${[nfc('café')]}
${nfd('café')} | ${[nfd('café')]}
${nfd('caféStyle')} | ${[nfd('café'), 'Style']}
${nfc('caféÁ')} | ${[nfc('café'), nfc('Á')]}
${nfd('caféÁ')} | ${[nfd('café'), nfd('Á')]}
`('extractWordsFromCode "$text"', ({ text, expected }) => {
const r = Text.extractWordsFromCode(text)
.map((wo) => wo.text)
.toArray();
expect(r).toEqual(expected);
});

test('case of Chinese characters', () => {
expect(Text.isUpperCase('携程旅行网')).toBe(false);
expect(Text.isLowerCase('携程旅行网')).toBe(false);
Expand Down Expand Up @@ -349,6 +364,14 @@ describe('Validates offset conversions', () => {
});
});

/**
 * Test helper: returns `s` converted to Unicode Normalization Form C
 * (accented letters are represented by precomposed code points where possible).
 */
function nfc(s: string): string {
    const composed = s.normalize('NFC');
    return composed;
}

/**
 * Test helper: returns `s` converted to Unicode Normalization Form D
 * (accented letters are split into a base letter followed by combining marks).
 */
function nfd(s: string): string {
    const decomposed = s.normalize('NFD');
    return decomposed;
}

function match(regexp: RegExp, text: string): (string | number)[] {
const x = Text.matchStringToTextOffset(regexp, text)
.concatMap((t) => [t.text, t.offset])
Expand Down
2 changes: 1 addition & 1 deletion packages/cspell-lib/src/util/text.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ export function cleanText(text: string): string {

export function cleanTextOffset(text: TextOffset): TextOffset {
return {
text: text.text.replace(regExIgnoreCharacters, (match: string) => ' '.repeat(match.length)),
text: cleanText(text.text),
offset: text.offset,
};
}
Expand Down
139 changes: 139 additions & 0 deletions packages/cspell-lib/src/util/textRegex.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import {
regExAccents,
regExAllLower,
regExAllUpper,
regExDanglingQuote,
regExFirstUpper,
regExSplitWords,
regExSplitWords2,
regExTrailingEndings,
} from './textRegex';

// Table-driven checks of the regular expressions exported by `./textRegex`.
// Most tables feed the same word in both NFC (composed) and NFD (decomposed) form
// so that accent handling is verified for either representation.
describe('Validate textRegex', () => {
    // cspell:ignore CODE'ing
    // Each expected entry is one match converted to an array (full match only, no capture groups).
    test.each`
        text | expected
        ${'hello'} | ${[]}
        ${'CODEing'} | ${[['ing']]}
        ${"CODE'ing"} | ${[["'ing"]]}
        ${"ERROR'd"} | ${[["'d"]]}
        ${"ERROR's"} | ${[["'s"]]}
        ${'ERRORs'} | ${[['s']]}
        ${'ERRORes'} | ${[['es']]}
        ${'ERRORth'} | ${[['th']]}
        ${'ERRORnth'} | ${[['nth']]}
        ${'ERRORies'} | ${[['ies']]}
        ${nfc('CAFÉed')} | ${[['ed']]}
        ${nfd('CAFÉed')} | ${[['ed']]}
        ${nfd('CAFÉ’ed')} | ${[['’ed']]}
        ${nfd('CAFÉ’s')} | ${[['’s']]}
    `('regExTrailingEndings on "$text"', ({ text, expected }: { text: string; expected: string[][] }) => {
        const matches = [...text.matchAll(regExTrailingEndings)].map((m) => Array.from(m));
        expect(matches).toEqual(expected);
    });

    test.each`
        text | expected
        ${'hello'} | ${[]}
        ${"ERROR's"} | ${[]}
        ${"'thing"} | ${["'"]}
        ${"n'cpp"} | ${["'"]}
        ${"s'thing"} | ${["'"]}
        ${"A'thing"} | ${["'"]}
        ${"s 'thing"} | ${["'"]}
        ${nfc(`é'thing`)} | ${["'"]}
        ${nfd(`é'thing`)} | ${["'"]}
    `('regExDanglingQuote on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
        const matches = text.match(regExDanglingQuote) ?? [];
        expect([...matches]).toEqual(expected);
    });

    // Only decomposed text contains a standalone combining mark to match.
    test.each`
        text | expected
        ${'hello'} | ${[]}
        ${"ERROR's"} | ${[]}
        ${nfc(`é'thing`)} | ${[]}
        ${nfd(`é'thing`)} | ${[nfd('á').replace('a', '')]}
    `('regExAccents on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
        const matches = text.match(regExAccents) ?? [];
        expect([...matches]).toEqual(expected);
    });

    // cspell:word érror
    test.each`
        text | expected
        ${'hello'} | ${[]}
        ${'ERROR'} | ${['ERROR']}
        ${'ERRORs'} | ${[]}
        ${nfc(`érror`).toUpperCase()} | ${[nfc('ÉRROR')]}
        ${nfd(`érror`).toUpperCase()} | ${[nfd('ÉRROR')]}
    `('regExAllUpper on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
        const matches = text.match(regExAllUpper) ?? [];
        expect([...matches]).toEqual(expected);
    });

    test.each`
        text | expected
        ${'hello'} | ${['hello']}
        ${'ERROR'} | ${[]}
        ${'Errors'} | ${[]}
        ${nfc(`érror`)} | ${[nfc('érror')]}
        ${nfd(`érror`)} | ${[nfd('érror')]}
        ${nfc(`café`)} | ${[nfc('café')]}
        ${nfd(`café`)} | ${[nfd('café')]}
    `('regExAllLower on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
        const matches = text.match(regExAllLower) ?? [];
        expect([...matches]).toEqual(expected);
    });

    test.each`
        text | expected
        ${'hello'} | ${[]}
        ${'ERROR'} | ${[]}
        ${'Errors'} | ${['Errors']}
        ${nfc(`Érror`)} | ${[nfc('Érror')]}
        ${nfd(`Érror`)} | ${[nfd('Érror')]}
    `('regExFirstUpper on "$text"', ({ text, expected }: { text: string; expected: string[] }) => {
        const matches = text.match(regExFirstUpper) ?? [];
        expect([...matches]).toEqual(expected);
    });

    // Each expected entry is [full match, group 1, group 2].
    test.each`
        text | expected
        ${'hello'} | ${[]}
        ${'errorCode'} | ${[['rC', 'r', 'C']]}
        ${nfc('caféStyle')} | ${[[nfc('éS'), nfc('é'), 'S']]}
        ${nfd('caféStyle')} | ${[[nfd('éS'), nfd('é'), 'S']]}
        ${'Errors'} | ${[]}
    `('regExSplitWords on "$text"', ({ text, expected }: { text: string; expected: string[][] }) => {
        const matches = [...text.matchAll(regExSplitWords)].map((m) => Array.from(m));
        expect(matches).toEqual(expected);
    });

    // Each expected entry is [full match, group 1, group 2].
    test.each`
        text | expected
        ${'hello'} | ${[]}
        ${'ERRORCode'} | ${[['RCo', 'R', 'Co']]}
        ${nfc('CAFÉStyle')} | ${[[nfc('ÉSt'), nfc('É'), 'St']]}
        ${nfd('CAFÉStyle')} | ${[[nfd('ÉSt'), nfd('É'), 'St']]}
        ${nfc('CODEÉrror')} | ${[[nfc('EÉr'), 'E', nfc('Ér')]]}
        ${nfd('CODEÉrror')} | ${[[nfd('EÉr'), 'E', nfd('Ér')]]}
        ${'ERRORS'} | ${[]}
    `('regExSplitWords2 on "$text"', ({ text, expected }: { text: string; expected: string[][] }) => {
        const matches = [...text.matchAll(regExSplitWords2)].map((m) => Array.from(m));
        expect(matches).toEqual(expected);
    });
});

// function s(t: string, on: string | RegExp = '|'): string[] {
// return t.split(on);
// }

/**
 * Test helper: returns `s` converted to Unicode Normalization Form C
 * (accented letters are represented by precomposed code points where possible).
 */
function nfc(s: string): string {
    const composed = s.normalize('NFC');
    return composed;
}

/**
 * Test helper: returns `s` converted to Unicode Normalization Form D
 * (accented letters are split into a base letter followed by combining marks).
 */
function nfd(s: string): string {
    const decomposed = s.normalize('NFD');
    return decomposed;
}
11 changes: 6 additions & 5 deletions packages/cspell-lib/src/util/textRegex.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
// cspell:ignore ings ning gimuy anrvtbf

/**
 * Matches a run of non-line-terminator characters followed by `\n`, `\r\n`, or the end of input.
 * The terminator (or the empty string at end of input) is captured in group 1.
 */
export const regExLines = /.*(\r?\n|$)/g;
export const regExUpperSOrIng = /(\p{Lu}+\\?['’]?(?:s|ing|ies|es|ings|ed|ning))(?!\p{Ll})/gu;
export const regExUpperSOrIng = /([\p{Lu}\p{M}]+\\?['’]?(?:s|ing|ies|es|ings|ed|ning))(?!\p{Ll})/gu;
/**
 * Matches a lowercase letter (optionally followed by one combining mark) that is immediately
 * followed by an uppercase letter, e.g. `rC` in `errorCode`.
 * Group 1 is the lowercase part, group 2 the uppercase letter.
 */
export const regExSplitWords = /(\p{Ll}\p{M}?)(\p{Lu})/gu;
/**
 * Matches an uppercase letter followed by an uppercase letter and a lowercase letter,
 * e.g. `RCo` in `ERRORCode`. Each uppercase letter may carry one combining mark.
 * Group 1 is the first uppercase letter, group 2 the uppercase + lowercase pair.
 */
export const regExSplitWords2 = /(\p{Lu}\p{M}?)(\p{Lu}\p{M}?\p{Ll})/gu;
export const regExWords = /\p{L}(?:(?:\\?['’])?\p{L})*/gu;
export const regExWords = /\p{L}\p{M}?(?:(?:\\?['’])?\p{L}\p{M}?)*/gu;
/**
 * Matches optional leading digits, then one letter, combining mark, `_`, `'`, `’` or `-`,
 * followed by any number of letters, marks, `\w` characters, quotes or hyphens.
 * Each following character may be preceded by a quote that is optionally backslash-escaped.
 */
export const regExWordsAndDigits = /(?:\d+)?[\p{L}\p{M}_'’-](?:(?:\\?['’])?[\p{L}\p{M}\w'’-])*/gu;
export const regExIgnoreCharacters = /\p{sc=Hiragana}|\p{sc=Han}|\p{sc=Katakana}|[\u30A0-\u30FF]|[\p{sc=Hangul}]/gu;
export const regExIgnoreCharacters = /[\p{sc=Hiragana}\p{sc=Han}\p{sc=Katakana}\u30A0-\u30FF\p{sc=Hangul}]/gu;
/**
 * Matches a whole string made of one uppercase letter followed by one or more lowercase letters.
 * Every letter may be followed by a single combining mark, so decomposed (NFD) text with an
 * accent on any letter is accepted just like its composed (NFC) form. This mirrors how
 * `regExAllUpper` and `regExAllLower` treat combining marks.
 */
export const regExFirstUpper = /^\p{Lu}\p{M}?(?:\p{Ll}\p{M}?)+$/u;
/** Matches a whole string made only of uppercase letters, each optionally followed by one combining mark. */
export const regExAllUpper = /^(?:\p{Lu}\p{M}?)+$/u;
/** Matches a whole string made only of lowercase letters, each optionally followed by one combining mark. */
export const regExAllLower = /^(?:\p{Ll}\p{M}?)+$/u;
/** Matches an underscore or a hyphen. */
export const regExPossibleWordBreaks = /[_-]/g;
/** Matches text written as `/pattern/flags`; group 1 is the pattern, group 2 the flags (any of `gimuy`). */
export const regExMatchRegExParts = /^\/(.*)\/([gimuy]*)$/;
/** Matches any single Unicode mark character (`\p{M}`), such as a combining accent. */
export const regExAccents = /\p{M}/gu;
/** Matches one of the letters `a n r v t b f` (either case) that directly follows a backslash; the backslash is not part of the match. */
export const regExEscapeCharacters = /(?<=\\)[anrvtbf]/gi;
export const regExDanglingQuote = /(?<=\P{L}\p{L}?)[']/gu;
/** Matches against leading `'` or `{single letter}'` */
export const regExDanglingQuote = /(?<=(?:^|(?!\p{M})\P{L})(?:\p{L}\p{M}?)?)[']/gu;
/** Match trailing endings after CAPS words */
export const regExTrailingEndings = /(?<=\p{Lu}{2})['’]?(?:s|d|ing[s]|ies|e[ds]|ning|th|nth)(?!\p{Ll})/gu;
export const regExTrailingEndings = /(?<=(?:\p{Lu}\p{M}?){2})['’]?(?:s|d|ings?|ies|e[ds]?|ning|th|nth)(?!\p{Ll})/gu;
45 changes: 32 additions & 13 deletions packages/cspell-lib/src/util/wordSplitter.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ describe('Validate wordSplitter', () => {
}

// cspell:ignore CVTPD CVTSI CVTTSD words'separated'by errorcode
// cspell:word Geschäft gescha
test.each`
text | expectedWords
${'hello'} | ${[tov({ text: 'hello', offset: 155 })]}
Expand All @@ -124,6 +125,7 @@ describe('Validate wordSplitter', () => {
${'_errorcode42_one_two'} | ${splitTov('_errorcode42|one|two')}
${"words'separated'by_singleQuote"} | ${splitTov(`words'separated'by|singleQuote`)}
${"Tom's_hardware"} | ${splitTov("Tom's|hardware")}
${'Geschäft'} | ${splitTov('Geschäft')}
`('split $text', ({ text, expectedWords }: TestSplit) => {
const prefix = 'this is some';
const line = {
Expand Down Expand Up @@ -186,18 +188,22 @@ describe('Validate wordSplitter', () => {

// cspell:ignore nstatic techo n'cpp n'log refactor'd
test.each`
text | expectedWords | calls
${'static'} | ${'static'} | ${1}
${'nstatic'} | ${'static'} | ${1}
${'techo'} | ${'echo'} | ${1}
${`n'cpp`} | ${'cpp'} | ${1}
${`n'log`} | ${'log'} | ${7}
${'64-bit'} | ${'bit'} | ${1}
${'128-bit'} | ${'bit'} | ${1}
${'256-sha'} | ${'256-sha'} | ${6}
${`REFACTOR'd`} | ${'REFACTOR'} | ${2}
${`dogs'`} | ${`dogs'`} | ${2}
${`planets’`} | ${`planets’`} | ${2}
text | expectedWords | calls
${'static'} | ${'static'} | ${1}
${'nstatic'} | ${'static'} | ${1}
${'techo'} | ${'echo'} | ${1}
${`n'cpp`} | ${'cpp'} | ${1}
${`î'cpp`} | ${'î|cpp'} | ${2}
${`îphoneStatic`} | ${'îphone|Static'} | ${2}
${`êphoneStatic`} | ${'êphone|Static'} | ${2}
${`geschäft`} | ${'geschäft'} | ${1}
${`n'log`} | ${'log'} | ${7}
${'64-bit'} | ${'bit'} | ${1}
${'128-bit'} | ${'bit'} | ${1}
${'256-sha'} | ${'256-sha'} | ${6}
${`REFACTOR'd`} | ${'REFACTOR'} | ${2}
${`dogs'`} | ${`dogs'`} | ${2}
${`planets’`} | ${`planets’`} | ${2}
`('split `$text` in doc', ({ text, expectedWords, calls }: TestSplit2) => {
const expectedWordSegments = splitTov(expectedWords);
const doc = sampleText();
Expand All @@ -220,7 +226,8 @@ describe('Validate wordSplitter', () => {
});

function has({ text }: TextOffset): boolean {
return text.length < 3 || !regHasLetters.test(text) || words.has(text) || words.has(text.toLowerCase());
const nfcText = text.normalize('NFC');
return text.length < 3 || !regHasLetters.test(text) || words.has(nfcText) || words.has(nfcText.toLowerCase());
}

function applyWordBreaks(text: TextOffset, breaks: number[]): TextOffset[] {
Expand Down Expand Up @@ -328,6 +335,9 @@ function sampleWordSet() {
CVTPD2PS
CVTTSD
echo
îphone
êphone
Geschäft
error codes
hello
MOVSX_r_rm16
Expand Down Expand Up @@ -373,5 +383,14 @@ function sampleText() {

128-bit values

î'cpp
îphoneStatic

geschäft

êphoneStatic

`;
}

// cspell:ignore êphone îphone geschäft