Key Phrases Extraction (U-SQL)

Summary

The KeyPhraseExtractor cognitive function returns a semicolon-separated (“;”) list of strings denoting the key talking points in the input text. This cognitive function employs techniques from Microsoft Office's sophisticated Natural Language Processing toolkit and supports English text.

Arguments TBD

KeyPhraseExtractor(
string TBD = "TBD", string outCol = "KeyPhrase")

Examples

Books
The examples utilize two books: War and Peace, and Peter Pan.

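  • war_and_peace.csv is installed automatically when you install the cognitive assemblies. The file is located at /usqlext/samples/cognition/war_and_peace.csv. The Extract Text script below reads it from /Samples/Books/war_and_peace.csv, so either copy the file there or change the path in the script.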
  • PeterPan.txt was obtained from https://www.gutenberg.org/files/16/16-0.txt. The Extract Text script below reads it from /Samples/Books/PeterPan.txt.

Extract Text

// War and Peace: load the CSV sample into a typed, five-column rowset.
@book =
    EXTRACT
        No      int,
        Year    string,
        Book    string,
        Chapter string,
        Text    string
    FROM @"/Samples/Books/war_and_peace.csv"
    USING Extractors.Csv();

// Peter Pan: load the plain-text file into a single Text column.
// NOTE(review): the backtick delimiter is presumably chosen because it should
// not occur in the book, so each line stays whole in Text -- confirm against
// the input file. With silent: true the extractor is expected to skip rows
// whose column count does not match instead of failing the job.
@otherBook =
    EXTRACT
        Text string
    FROM @"/Samples/Books/PeterPan.txt"
    USING Extractors.Text(delimiter: '`', silent: true);

Extract Key Phrases

// Make the key-phrase cognitive assembly available to this script.
REFERENCE ASSEMBLY [TextKeyPhrase];

// War and Peace: pass the five input columns through unchanged (READONLY)
// and let the processor add a KeyPhrase string column.
@keyphrase =
    PROCESS @book
    PRODUCE No, Year, Book, Chapter, Text,
            KeyPhrase string
    READONLY No, Year, Book, Chapter, Text
    USING new Cognition.Text.KeyPhraseExtractor();

OUTPUT @keyphrase
TO "/ReferenceGuide/Cognition/Text/KeyPhraseExtractor1A.txt"
USING Outputters.Tsv(outputHeader: true);

// Peter Pan: same processor, with Text as the only pass-through column.
@otherKeyPhrases =
    PROCESS @otherBook
    PRODUCE Text,
            KeyPhrase string
    READONLY Text
    USING new Cognition.Text.KeyPhraseExtractor();

OUTPUT @otherKeyPhrases
TO "/ReferenceGuide/Cognition/Text/KeyPhraseExtractor1B.txt"
USING Outputters.Tsv(outputHeader: true);

Tokenize the key phrases

// War and Peace: turn the ';'-joined KeyPhrase string into one row per phrase.
// NOTE(review): Split is called directly on KeyPhrase, so a null value would
// fail the row -- confirm the extractor never emits null.
@KPsplits =
    SELECT No, Year, Book, Chapter, Text,
           Phrase.KeyPhrase
    FROM @keyphrase
         CROSS APPLY
             EXPLODE (KeyPhrase.Split(';')) AS Phrase(KeyPhrase);

OUTPUT @KPsplits
TO "/ReferenceGuide/Cognition/Text/KeyPhraseExtractor2A.txt"
USING Outputters.Tsv(outputHeader: true);

// Peter Pan: same one-row-per-phrase expansion.
@OtherKPsplits =
    SELECT Text,
           Phrase.KeyPhrase
    FROM @otherKeyPhrases
         CROSS APPLY
             EXPLODE (KeyPhrase.Split(';')) AS Phrase(KeyPhrase);

OUTPUT @OtherKPsplits
TO "/ReferenceGuide/Cognition/Text/KeyPhraseExtractor2B.txt"
USING Outputters.Tsv(outputHeader: true);

Some queries against @KPsplits from the example above

// Top 5 key phrases overall.
// Null or empty phrases are dropped before counting. The secondary sort on
// KeyPhrase makes the pick among phrases with equal counts repeatable.
@wordCount =
    SELECT KeyPhrase,
           COUNT(KeyPhrase) AS wordCount
    FROM @KPsplits
    WHERE NOT string.IsNullOrEmpty(KeyPhrase)
    GROUP BY KeyPhrase
    ORDER BY COUNT(KeyPhrase) DESC, KeyPhrase ASC
    FETCH 5 ROWS;

// ORDER BY ... FETCH in the SELECT only chooses which rows survive; the order
// of rows in the file has to be requested on the OUTPUT statement itself.
OUTPUT @wordCount
TO "/ReferenceGuide/Cognition/Text/wordCount1.txt"
ORDER BY wordCount DESC, KeyPhrase ASC
USING Outputters.Tsv();


// Top 5 (chapter, key phrase) pairs across the whole book.
// Null or empty phrases are dropped before counting. Chapter and KeyPhrase act
// as tiebreakers so the pick among pairs with equal counts is repeatable.
@wordCount =
    SELECT Chapter,
           KeyPhrase,
           COUNT(KeyPhrase) AS PhraseCount
    FROM @KPsplits
    WHERE NOT string.IsNullOrEmpty(KeyPhrase)
    GROUP BY Chapter, KeyPhrase
    ORDER BY COUNT(KeyPhrase) DESC, Chapter ASC, KeyPhrase ASC
    FETCH 5 ROWS;

// ORDER BY ... FETCH in the SELECT only chooses which rows survive; the order
// of rows in the file has to be requested on the OUTPUT statement itself.
OUTPUT @wordCount
TO "/ReferenceGuide/Cognition/Text/wordCount2.txt"
ORDER BY PhraseCount DESC, Chapter ASC, KeyPhrase ASC
USING Outputters.Tsv();

// Top 5 key phrases per chapter.

// Step 1: count each non-empty phrase within its chapter.
@chapterPhraseCounts =
    SELECT Chapter,
           KeyPhrase,
           COUNT(KeyPhrase) AS PhraseCount
    FROM @KPsplits
    WHERE NOT string.IsNullOrEmpty(KeyPhrase)
    GROUP BY Chapter, KeyPhrase;

// Step 2: rank phrases inside each chapter, most frequent first. KeyPhrase is
// the tiebreaker so phrases with equal counts always get the same rank.
@rankedPhrases =
    SELECT ROW_NUMBER() OVER(PARTITION BY Chapter
                             ORDER BY PhraseCount DESC, KeyPhrase ASC) AS RowNumber,
           Chapter,
           KeyPhrase,
           PhraseCount
    FROM @chapterPhraseCounts;

// Step 3: keep the five best-ranked phrases of every chapter.
@wordCount =
    SELECT Chapter,
           KeyPhrase,
           PhraseCount
    FROM @rankedPhrases
    WHERE RowNumber <= 5;

// Rowsets carry no order of their own, so the file order is requested here.
OUTPUT @wordCount
TO "/ReferenceGuide/Cognition/Text/wordCount3.txt"
ORDER BY Chapter ASC, PhraseCount DESC, KeyPhrase ASC
USING Outputters.Tsv();

See Also