3_NLP
December 17, 2020
1 Tokenizer
[ ]: from tensorflow.keras.preprocessing.text import Tokenizer

sentences = [
    'i love my dog',
    'I, love my cat',
    'You love my dog!'
]

# Build the vocabulary; the Tokenizer lowercases and strips punctuation by default.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

print(tokenizer.word_index)   # word -> index, most frequent words first
print(tokenizer.word_counts)  # word -> total occurrences across all sentences
print(tokenizer.word_docs)    # word -> number of sentences containing the word
{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}
OrderedDict([('i', 2), ('love', 3), ('my', 3), ('dog', 2), ('cat', 1), ('you', 1)])
defaultdict(<class 'int'>, {'my': 3, 'love': 3, 'dog': 2, 'i': 2, 'cat': 1, 'you': 1})
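Note that num_words does not limit what fit_on_texts records: word_index, word_counts, and word_docs always cover the full vocabulary. The cap only takes effect when text is converted to sequences. A minimal sketch of that behavior, reusing the sentences above (the num_words=3 value is an illustrative choice, not from the original notebook):

from tensorflow.keras.preprocessing.text import Tokenizer

sentences = [
    'i love my dog',
    'I, love my cat',
    'You love my dog!'
]

# num_words=3 means only the 2 most frequent words (indices 1 and 2,
# i.e. 'love' and 'my') survive texts_to_sequences; rarer words are dropped.
small_tokenizer = Tokenizer(num_words=3)
small_tokenizer.fit_on_texts(sentences)

print(small_tokenizer.word_index)                     # still the full vocabulary
print(small_tokenizer.texts_to_sequences(sentences))  # should give [[1, 2], [1, 2], [1, 2]]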
1.1 Text to sequence
[ ]: from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

# Reserve index 1 for out-of-vocabulary words seen later at inference time.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Convert each sentence to a list of word indices, then pad/truncate to length 6.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=6)

print("\nWord Index = ", word_index)
print("\nSequences = ", sequences)
print("\nPadded Sequences:")
print(padded)
Word Index = {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
Sequences = [[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
Padded Sequences:
[[ 0 0 5 3 2 4]
[ 0 0 5 3 2 7]
[ 0 0 6 3 2 4]
[ 6 9 2 4 10 11]]
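pad_sequences pads and truncates at the front by default (padding='pre', truncating='pre'), which is why the zeros appear on the left and the seven-token sentence lost its first token (8, 'do') rather than its last. A small sketch of the 'post' variants, reusing the sequences from the cell above:

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad and truncate at the end instead of the front.
post_padded = pad_sequences(sequences, maxlen=6, padding='post', truncating='post')
print(post_padded)
# The long sentence should now drop its last token (11, 'amazing'),
# and the zeros move to the right of the shorter sentences.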
1.2 Try the trained tokenizer on new data
[ ]: test_data = [
    'i really love my dog so much',
    'my dog loves my manatee'
]

# Reuse the tokenizer fitted above; words it never saw map to the <OOV> index (1).
test_seq = tokenizer.texts_to_sequences(test_data)

print("\nWord Index = ", word_index)
print("\nTest Sequence = ", test_seq)

padded = pad_sequences(test_seq, maxlen=10)
print("\nPadded Test Sequence: ")
print(padded)
Word Index = {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
Test Sequence = [[5, 1, 3, 2, 4, 1, 1], [2, 4, 1, 2, 1]]
Padded Test Sequence:
[[0 0 0 5 1 3 2 4 1 1]
[0 0 0 0 0 2 4 1 2 1]]
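Every word the tokenizer never saw ('really', 'so', 'much', 'loves', 'manatee') is mapped to the <OOV> index 1. A short sketch of one way to see where those substitutions happened, decoding the sequences back to text with sequences_to_texts:

# Decode the test sequences back to words; unseen words come back as <OOV>.
print(tokenizer.sequences_to_texts(test_seq))
# Should read roughly:
# ['i <OOV> love my dog <OOV> <OOV>', 'my dog <OOV> my <OOV>']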
1.3 Tokenizing the Sarcasm dataset
[ ]: !wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json >& /dev/null

import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

with open("/tmp/sarcasm.json", 'r') as f:
    datastore = json.load(f)

# Each record holds a headline, a sarcasm label, and the source article URL.
sentences = []
labels = []
urls = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

# No num_words cap here: keep the full vocabulary, with <OOV> reserved at index 1.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(len(word_index))
print(word_index)

# Convert every headline to a sequence and pad at the end ('post') to a common length.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
print(padded[0])
print(padded.shape)
29657
{'<OOV>': 1, 'to': 2, 'of': 3, 'the': 4, 'in': 5, 'for': 6, 'a': 7, 'on': 8,
'and': 9, 'with': 10, 'is': 11, 'new': 12, 'trump': 13, 'man': 14, 'from': 15,
'at': 16, 'about': 17, 'you': 18, 'this': 19, 'by': 20, 'after': 21, 'up': 22,
'out': 23, 'be': 24, 'how': 25, 'as': 26, 'it': 27, 'that': 28, 'not': 29,
'are': 30, 'your': 31, 'his': 32, 'what': 33, 'he': 34, 'all': 35, 'just': 36,
'who': 37, 'has': 38, 'will': 39, 'more': 40, 'one': 41, 'into': 42, 'report':
43, 'year': 44, 'why': 45, 'have': 46, 'area': 47, 'over': 48, 'donald': 49,
'u': 50, 'day': 51, 'says': 52, 's': 53, 'can': 54, 'first': 55, 'woman': 56,
'time': 57, 'like': 58, 'her': 59, "trump's": 60, 'old': 61, 'no': 62, 'get':
63, 'off': 64, 'an': 65, 'life': 66, 'people': 67, 'obama': 68, 'now': 69,
'house': 70, 'still': 71, "'": 72, 'women': 73, 'make': 74, 'was': 75, 'than':
76, 'white': 77, 'back': 78, 'my': 79, 'i': 80, 'clinton': 81, 'down': 82, 'if': 83,
'5': 84, 'when': 85, 'world': 86, 'could': 87, 'we': 88, 'their': 89,
'before': 90, 'americans': 91, 'way': 92, 'do': 93, 'family': 94, 'most': 95,
'gop': 96, 'they': 97, 'study': 98, 'school': 99, "it's": 100, 'black': 101,
'best': 102, 'years': 103, 'bill': 104, 'should': 105, '3': 106, 'him': 107,
'would': 108, 'so': 109, 'police': 110, 'only': 111, 'watch': 112, 'american':
113, 'really': 114, 'being': 115, 'but': 116, 'last': 117, 'know': 118, '10':
119, "can't": 120, 'death': 121, 'home': 122, 'during': 123, 'video': 124,
'finds': 125, 'state': 126, 'or': 127, 'president': 128, 'health': 129, 'going':
130, 'say': 131, 'show': 132, 'nation': 133, 'good': 134, 'things': 135,
'hillary': 136, "'the": 137, 'may': 138, '2': 139, 'against': 140, 'campaign':
141, 'every': 142, 'she': 143, 'love': 144, 'mom': 145, 'need': 146, 'big': 147,
'right': 148, 'party': 149, 'gets': 150, '000': 151, 'too': 152, 'getting': 153,
'these': 154, 'kids': 155, 'some': 156, 'parents': 157, 'work': 158, 'court':
159, 'little': 160, 'change': 161, 'take': 162, 'high': 163, 'makes': 164,
'self': 165, 'our': 166, 'calls': 167, 'john': 168, 'other': 169, 'news': 170,
'through': 171, "doesn't": 172, 'while': 173, "here's": 174, 'never': 175,
'child': 176, 'gay': 177, 'dead': 178, 'look': 179, 'election': 180, 'want':
181, 'own': 182, '4': 183, "don't": 184, 'see': 185, 'takes': 186, 'america':
187, '7': 188, 'local': 189, 'real': 190, 'where': 191, 'next': 192, 'stop':
193, 'even': 194, 'its': 195, "he's": 196, 'war': 197, 'college': 198, 'go':
199, '6': 200, "nation's": 201, 'sex': 202, 'bush': 203, 'made': 204, 'plan':
205, 'office': 206, 'again': 207, 'guy': 208, 'two': 209, 'dad': 210, 'another':
211, 'around': 212, 'dog': 213, 'got': 214, '1': 215, 'million': 216, 'ever':
217, 'week': 218, 'baby': 219, 'debate': 220, 'thing': 221, 'them': 222, 'gun':
223, 'wants': 224, 'care': 225, 'us': 226, 'help': 227, 'much': 228, 'long':
229, 'night': 230, 'congress': 231, 'job': 232, 'finally': 233, 'north': 234,
'been': 235, 'under': 236, "man's": 237, 'actually': 238, 'star': 239,
'national': 240, 'live': 241, 'climate': 242, 'season': 243, 'money': 244,
'couple': 245, "won't": 246, '8': 247, '9': 248, 'top': 249, 'god': 250, 'anti':
251, 'media': 252, 'food': 253, 'ways': 254, '20': 255, 'shows': 256, 'sexual':
257, 'better': 258, 'give': 259, 'shooting': 260, 'had': 261, 'teen': 262,
'face': 263, 'making': 264, 'game': 265, 'paul': 266, 'reveals': 267, 'me': 268,
'trying': 269, 'senate': 270, 'supreme': 271, 'announces': 272, 'there': 273,
'away': 274, 'men': 275, 'history': 276, 'business': 277, 'bad': 278, 'without':
279, 'students': 280, 'everyone': 281, 'attack': 282, 'end': 283, 'story': 284,
'fight': 285, 'facebook': 286, 'son': 287, 'free': 288, 'children': 289,
'enough': 290, 'tv': 291, 'law': 292, 'movie': 293, 'city': 294, 'any': 295,
'introduces': 296, 'pope': 297, 'deal': 298, 'government': 299, 'body': 300,
'part': 301, 'york': 302, '11': 303, 'tell': 304, 'great': 305, 'film': 306,
'does': 307, 'former': 308, 'single': 309, 'entire': 310, 'friends': 311,
'fire': 312, 'call': 313, 'found': 314, 'friend': 315, 'book': 316, 'wedding':
317, 'think': 318, 'come': 319, 'republican': 320, 'must': 321, 'girl': 322,
'find': 323, 'second': 324, 'middle': 325, 'morning': 326, 'support': 327,
'same': 328, 'speech': 329, 'public': 330, 'photos': 331, 'use': 332, 'talk':
333, 'line': 334, 'car': 335, 'sanders': 336, 'name': 337, 'keep': 338,
'thinks': 339, 'run': 340, 'already': 341, 'looking': 342, 'presidential': 343,
'coming': 344, 'james': 345, 'republicans': 346, 'email': 347, "didn't": 348,
'tax': 349, 'pretty': 350, 'case': 351, 'company': 352, 'behind': 353, 'rights': …
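With every headline now a padded row and the labels collected alongside, the data is ready for a classifier. A minimal sketch of how it could be wired up; the layer sizes, loss, and the commented-out training call are illustrative assumptions, not part of the original notebook:

import numpy as np
import tensorflow as tf

labels = np.array(labels)            # pair the labels with the padded matrix
vocab_size = len(word_index) + 1     # +1 because index 0 is reserved for padding
max_length = padded.shape[1]

# Assumed architecture: embed each word, average over the headline,
# then predict sarcastic vs. not with a sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 16, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit(padded, labels, epochs=10)   # training left commented out in this sketch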