
TensorFlow Machine Learning Cookbook

For most of this book, we will rely on datasets to fit machine learning algorithms. This section gives instructions for accessing each of those datasets through TensorFlow and Python.
Some of the datasets we will use are built into Python libraries, some require a Python script to download, and some must be downloaded manually through the Internet. Almost all of them require an active Internet connection to retrieve the data.
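Because several of the loaders below re-download their files on every run, it can be handy to cache each file locally. The following helper is our own minimal sketch (the name load_or_download and its caching behavior are illustrative assumptions, not code from the recipes):

import os
import requests

def load_or_download(url, local_path):
    # Hypothetical helper: return cached text if present, otherwise download it
    if os.path.isfile(local_path):
        with open(local_path, 'r') as f:
            return f.read()
    text = requests.get(url).text
    with open(local_path, 'w') as f:
        f.write(text)
    return text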
Iris data: This classic dataset ships with the scikit-learn library and can be loaded as follows:

from sklearn import datasets
iris = datasets.load_iris()
print(len(iris.data))
150
print(len(iris.target))
150
print(iris.data[0])  # Sepal length, Sepal width, Petal length, Petal width
[ 5.1  3.5  1.4  0.2]
print(set(iris.target))  # I. setosa, I. versicolor, I. virginica
{0, 1, 2}
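scikit-learn's load_iris() returns a Bunch object that also carries the column and class names, which is a quick way to verify the comments above:

print(iris.feature_names)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
print(iris.target_names)
['setosa' 'versicolor' 'virginica']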
Birth weight data: The low birth weight dataset can be downloaded from the University of Massachusetts and parsed as follows:

import requests
birthdata_url = 'https://www.umass.edu/statdata/statdata/data/lowbwt.dat'
birth_file = requests.get(birthdata_url)
birth_data = birth_file.text.split('\r\n')[5:]
birth_header = [x for x in birth_data[0].split(' ') if len(x) >= 1]
birth_data = [[float(x) for x in y.split(' ') if len(x) >= 1]
              for y in birth_data[1:] if len(y) >= 1]
print(len(birth_data))
189
print(len(birth_data[0]))
11
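If you want the rows split into features and a target right away, one hedged sketch is the following; we assume here that the 'BWT' (actual birth weight) column in birth_header is the target, so check the header of your download before relying on it:

import numpy as np

# Assumption: 'BWT' is the column we want to predict; all others are features
target_col = birth_header.index('BWT')
y_vals = np.array([row[target_col] for row in birth_data])
x_vals = np.array([[v for i, v in enumerate(row) if i != target_col]
                   for row in birth_data])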
Boston housing data: This dataset is hosted in the UCI machine learning repository:

import requests
housing_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
housing_header = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
housing_file = requests.get(housing_url)
housing_data = [[float(x) for x in y.split(' ') if len(x) >= 1]
                for y in housing_file.text.split('\n') if len(y) >= 1]
print(len(housing_data))
506
print(len(housing_data[0]))
14
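The last column, MEDV (median home value in $1000s), is the usual regression target for this dataset, so a natural next step is to separate it from the features:

import numpy as np

# MEDV (median home value) is the final column; the other 13 are features
y_vals = np.array([row[13] for row in housing_data])
x_vals = np.array([row[0:13] for row in housing_data])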
MNIST handwritten digit data: MNIST (Mixed National Institute of Standards and Technology) is included in the TensorFlow examples and can be loaded as follows:

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
print(len(mnist.train.images))
55000
print(len(mnist.test.images))
10000
print(len(mnist.validation.images))
5000
print(mnist.train.labels[1,:])  # This is a one-hot encoding of a 3
[ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
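The returned Datasets object can also serve randomized mini-batches, which is useful during training:

batch_xs, batch_ys = mnist.train.next_batch(100)
print(batch_xs.shape)  # (100, 784): each 28x28 image is flattened
print(batch_ys.shape)  # (100, 10): one-hot labels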
Spam-ham text data: This dataset is available from the UCI machine learning repository; we download the .zip file and extract the spam-ham text data as follows:

import requests
import io
from zipfile import ZipFile
zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
r = requests.get(zip_url)
z = ZipFile(io.BytesIO(r.content))
file = z.read('SMSSpamCollection')
text_data = file.decode()
text_data = text_data.encode('ascii', errors='ignore')
text_data = text_data.decode().split('\n')
text_data = [x.split('\t') for x in text_data if len(x) >= 1]
[text_data_target, text_data_train] = [list(x) for x in zip(*text_data)]
print(len(text_data_train))
5574
print(set(text_data_target))
{'ham', 'spam'}
print(text_data_train[1])
Ok lar... Joking wif u oni...
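For classification, the string labels are usually converted into numeric targets; a minimal sketch:

# Map 'spam' to 1 and 'ham' to 0 for use as a numeric target
binary_target = [1 if label == 'spam' else 0 for label in text_data_target]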
Movie review data: The Cornell movie review dataset comes as a compressed tar archive, which we stream into memory and extract:

import requests
import io
import tarfile
movie_data_url = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
r = requests.get(movie_data_url)
# Stream data into a temporary in-memory object
stream_data = io.BytesIO(r.content)
tmp = io.BytesIO()
while True:
    s = stream_data.read(16384)
    if not s:
        break
    tmp.write(s)
stream_data.close()
tmp.seek(0)
# Extract tar file
tar_file = tarfile.open(fileobj=tmp, mode="r:gz")
pos = tar_file.extractfile('rt-polaritydata/rt-polarity.pos')
neg = tar_file.extractfile('rt-polaritydata/rt-polarity.neg')
# Save pos/neg reviews (also deal with encoding)
pos_data = []
for line in pos:
    pos_data.append(line.decode('ISO-8859-1').encode('ascii', errors='ignore').decode())
neg_data = []
for line in neg:
    neg_data.append(line.decode('ISO-8859-1').encode('ascii', errors='ignore').decode())
tar_file.close()
print(len(pos_data))
5331
print(len(neg_data))
5331
# Print out first negative review
print(neg_data[0])
simplistic , silly and tedious .
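To train a single classifier on this data, the two lists can be merged with matching sentiment labels; a small sketch:

# Label positive reviews 1 and negative reviews 0, keeping texts aligned
texts = pos_data + neg_data
labels = [1] * len(pos_data) + [0] * len(neg_data)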
Works of Shakespeare text data: The complete works of Shakespeare are available from Project Gutenberg:

import requests
shakespeare_url = 'http://www.gutenberg.org/cache/epub/100/pg100.txt'
# Get Shakespeare text
response = requests.get(shakespeare_url)
shakespeare_file = response.content
# Decode binary into string
shakespeare_text = shakespeare_file.decode('utf-8')
# Drop the first few descriptive paragraphs
shakespeare_text = shakespeare_text[7675:]
print(len(shakespeare_text))  # Number of characters
5582212
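Since this text is typically used for character-level modeling, one quick sanity check is the size of the character vocabulary (this snippet is our own illustration):

# Count the distinct characters a character-level model would represent
vocab = sorted(set(shakespeare_text))
print(len(vocab))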
English-German sentence translation data: This parallel corpus is hosted at ManyThings.org and can be downloaded and parsed as follows:

import requests
import io
from zipfile import ZipFile
sentence_url = 'http://www.manythings.org/anki/deu-eng.zip'
r = requests.get(sentence_url)
z = ZipFile(io.BytesIO(r.content))
file = z.read('deu.txt')
# Format data
eng_ger_data = file.decode()
eng_ger_data = eng_ger_data.encode('ascii', errors='ignore')
eng_ger_data = eng_ger_data.decode().split('\n')
eng_ger_data = [x.split('\t') for x in eng_ger_data if len(x) >= 1]
[english_sentence, german_sentence] = [list(x) for x in zip(*eng_ger_data)]
print(len(english_sentence))
137673
print(len(german_sentence))
137673
print(eng_ger_data[10])
['I won!', 'Ich habe gewonnen!']
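A quick way to confirm that the English and German lists stayed aligned after the zip(*...) transpose is to print a few pairs side by side:

# Print the first three sentence pairs to verify the alignment
for eng, ger in zip(english_sentence[:3], german_sentence[:3]):
    print(eng, '->', ger)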
When it comes time to use one of these datasets in a recipe, we will refer you back to this section and assume the data is loaded as described above. If further data transformation or preprocessing is needed, the recipe itself will provide that code.