Author: Pavanmirla@perceptron.solutions

Embedding is just a numeric representation of an entity. Entity could be level in a categorical column, or word in a sentence. Vector [ numeric array representation] of an object/entity is called Embedding

Applications

Model Features

Feature compression

Nearest Neighbor search

Transfer Learning

Step1- Install virtualenv package to isolate your python project

pip install virtualenv

Step2 - Create an environment - say my_python3_environment. Install Python3 in this environment

virtualenv --system-site-packages -p python3 ./my_python3_environment

Step3- Activate this environment to install additional packages

source ./my_python3_environment/bin/activate

Step4- Now you will be in a virtual environment. Now install tensorflow

pip install tensorflow

Create a python file to source your csv data

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

#check if eager mode is enabled
tf.executing_eagerly()

URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
dataframe = pd.read_csv(URL)
dataframe.head()

train,test = train_test_split(dataframe,test_size=0.2)
train,val = train_test_split(train,test_size=0.2)
print(len(train))
print(len(val))
print(len(test))
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe= dataframe.copy()
    labels = dataframe.pop('target')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size= len(dataframe))
    ds= ds.batch(batch_size)
    return ds 

batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)

for feature_batch, label_batch in train_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of ages:', feature_batch['age'])
  print('A batch of targets:', label_batch )

feature_columns_all = []
#numeric_cols
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    feature_columns_all.append(feature_column.numeric_column(header))
#bucket columns
age = feature_column.numeric_column("age")
age_buckets = feature_column.bucketized_column(age, boundaries = [18,25,30,35, 40,45,50,55,60, 65])
feature_columns_all.append(age_buckets)
)
vocab_thal = dataframe.loc[:,'thal'].unique().tolist()
thal_fc_cat =  feature_column.categorical_column_with_vocabulary_list('thal',vocab_thal)
thal_one_hot = feature_column.indicator_column(thal_fc_cat)
feature_columns_all.append(thal_one_hot)
thal_fc_embedding = feature_column.embedding_column(thal_fc_cat, dimension=8)
feature_columns_all.append(thal_fc_embedding)

crossed_fc= feature_column.crossed_column([age_buckets, thal_fc_cat],hash_bucket_size=10)
crossed_fc_ind = feature_column.indicator_column(crossed_fc)
feature_columns_all.append(crossed_fc_ind)
feature_layer = tf.keras.layers.DenseFeatures(feature_columns_all)
batch_size = 32
train_ds = df_to_dataset(train, batch_size = batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size= batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


import datetime
log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
checkpoint_path="logs/checkpoint/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")


tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_weights_only= True, verbose=1)



model.fit(train_ds,      validation_data=val_ds,callbacks=[tensorboard_callback,checkpoint_callback ], epochs=5)

tensorboard --logdir logs/fit

feature_layer.get_config()['feature_columns'].keys()
[{'class_name': 'NumericColumn',
  'config': {'key': 'age',
   'shape': (1,),
   'default_value': None,
   'dtype': 'float32',
   'normalizer_fn': None}},
 {'class_name': 'BucketizedColumn',
  'config': {'source_column': {'class_name': 'NumericColumn',
    'config': {'key': 'age',
     'shape': (1,),
     'default_value': None,
     'dtype': 'float32',
     'normalizer_fn': None}},
   'boundaries': (18, 25, 30, 35, 40, 45, 50, 55, 60, 65)}},
 {'class_name': 'IndicatorColumn',
  'config': {'categorical_column': {'class_name': 'CrossedColumn',
    'config': {'keys': ({'class_name': 'BucketizedColumn',
       'config': {'source_column': {'class_name': 'NumericColumn',
         'config': {'key': 'age',
          'shape': (1,),
          'default_value': None,
          'dtype': 'float32',
          'normalizer_fn': None}},
        'boundaries': (18, 25, 30, 35, 40, 45, 50, 55, 60, 65)}},
      {'class_name': 'VocabularyListCategoricalColumn',
       'config': {'key': 'thal',
        'vocabulary_list': ('fixed', 'normal', 'reversible', '1', '2'),
        'dtype': 'string',
        'default_value': -1,
        'num_oov_buckets': 0}}),
     'hash_bucket_size': 10,
     'hash_key': None}}}},
 {'class_name': 'NumericColumn',
  'config': {'key': 'ca',
   'shape': (1,),
   'default_value': None,
   'dtype': 'float32',
   'normalizer_fn': None}},
 {'class_name': 'NumericColumn',
  'config': {'key': 'chol',
   'shape': (1,),
   'default_value': None,
   'dtype': 'float32',
   'normalizer_fn': None}},
 {'class_name': 'NumericColumn',
  'config': {'key': 'oldpeak',
   'shape': (1,),
   'default_value': None,
   'dtype': 'float32',
   'normalizer_fn': None}},
 {'class_name': 'NumericColumn',
  'config': {'key': 'slope',
   'shape': (1,),
   'default_value': None,
   'dtype': 'float32',
   'normalizer_fn': None}},
 {'class_name': 'EmbeddingColumn',
  'config': {'categorical_column': {'class_name': 'VocabularyListCategoricalColumn',
    'config': {'key': 'thal',
     'vocabulary_list': ('fixed', 'normal', 'reversible', '1', '2'),
     'dtype': 'string',
     'default_value': -1,
     'num_oov_buckets': 0}},
   'dimension': 8,
   'combiner': 'mean',
   'initializer': {'class_name': 'TruncatedNormal',
    'config': {'mean': 0.0,
     'stddev': 0.35355339059327373,
     'seed': None,
     'dtype': 'float32'}},
   'ckpt_to_load_from': None,
   'tensor_name_in_ckpt': None,
   'max_norm': None,
   'trainable': True}},
 {'class_name': 'IndicatorColumn',
  'config': {'categorical_column': {'class_name': 'VocabularyListCategoricalColumn',
    'config': {'key': 'thal',
     'vocabulary_list': ('fixed', 'normal', 'reversible', '1', '2'),
     'dtype': 'string',
     'default_value': -1,
     'num_oov_buckets': 0}}}},
 {'class_name': 'NumericColumn',
  'config': {'key': 'thalach',
   'shape': (1,),
   'default_value': None,
   'dtype': 'float32',
   'normalizer_fn': None}},
 {'class_name': 'NumericColumn',
  'config': {'key': 'trestbps',
   'shape': (1,),
   'default_value': None,
   'dtype': 'float32',
   'normalizer_fn': None}}]
w =feature_layer.get_weights()[0]

thal_fc_embedding = feature_column.embedding_column(thal_fc_cat, dimension=8)
feature_columns_all.append(thal_fc_embedding)
example_batch = next(iter(train_ds))[0]
all_w = feature_layer(example_batch).numpy()

tf.keras.models.save_model(model, "model_saved_gfce") # without extension .h5, only string of your model
new_model = tf.keras.models.load_model("model_saved_gfce") # as you have saved for the first time, without extension again.


loss, accuracy = new_model.evaluate(test_ds)
print("Accuracy", accuracy)

There is another way to save . Refer to documentation always. Bookmark these pages and get familiar with the structure to navigate. Use the extension model_saved_gfce/00000002 when you save model using tf.keras.models.save_model function call

tf.keras.models.save_model(model, "model_saved_gfce/00000002") # without extension .h5, only string of your model

(base) Pavans-MBP:google_tf_functions_examples pavanmirla$ ls -l model_saved_gfce
total 0
drwxr-xr-x  00000001
drwxr-xr-x  00000002
batch_size_all = len(train)
dataframe_ds = df_to_dataset(train, batch_size= batch_size_all)
example_batch = next(iter(dataframe_ds))[0]
all_w = feature_layer(example_batch).numpy()

all_w_df= pd.DataFrame(all_w)

all_w_df.to_csv("months_embedding_space.csv",sep="\t", index=False, header=False)

This book on reinforcement learning suggested TensorBoardX.

from sklearn.cluster import KMeans

kmeans =  KMeans(n_clusters=10, random_state=0).fit(all_w_df)
groups = kmeans.predict(all_w_df)

groups_df = pd.DataFrame(groups, columns= ['cluster_group'])
groups_df.to_csv("meta_data_embeddings.tsv", sep='\t',index= False, header = False)

Git clone this standalone tensorflow projector

tf.keras.utils.plot_model(model, 'image_output_model.png', show_shapes=True)

Where to find Keras related documentation?

Follow Tensorflow Guide

docker run -p 8500:8500 \
--mount type=bind,source=/Users/pavanmirla/projects/<your_folder>/google_tf_functions_examples/model_saved_gfce,target=/models/model_saved_gfce \
-e MODEL_NAME=model_saved_gfce -t tensorflow/serving &

This format seems to be required for Whatif Tool.Link for the helper functions.

But, how to write it back to a tfrecord file? This code below does not work. Here is t

Example for scenario analysis. Variable values are replaced with average values. Shown below

request_data = { 'instances': 
    [
        {
            'dep_delay': dep_delay,
            'taxiout': taxiout,
            'distance': 160.0,
            'avg_arr_delay': avg_arr_delay,
            'dest': 'CDV'
        }
        for dep_delay, taxiout,avg_arr_delay in 
        [   [ 16.0, 13.0, 67.0],
            [ 13.3, 13.0, 67.0],
            [ 16.0, 16.0, 67.0],
            [ 16.0, 13.0, 4.0]
        ]
    ]
}





#output

request_data
{'instances': [{'dep_delay': 16.0,
   'taxiout': 13.0,
   'distance': 160.0,
   'avg_arr_delay': 67.0,
   'dest': 'CDV'},
  {'dep_delay': 13.3,
   'taxiout': 13.0,
   'distance': 160.0,
   'avg_arr_delay': 67.0,
   'dest': 'CDV'},
  {'dep_delay': 16.0,
   'taxiout': 16.0,
   'distance': 160.0,
   'avg_arr_delay': 67.0,
   'dest': 'CDV'},
  {'dep_delay': 16.0,
   'taxiout': 13.0,
   'distance': 160.0,
   'avg_arr_delay': 4.0,
   'dest': 'CDV'}]}
feature_layer.get_config()['feature_columns'].keys()

Link to code