Add the possibility to transform the data: normalize/standardize

zarch · Aug 20, 2013 · 77f3d8f · 77f3d8f
1 parent 1d17668
commit 77f3d8f
Show file tree

Hide file tree

Showing 5 changed files with 63 additions and 24 deletions.
diff --git a/libmlcls/Makefile b/libmlcls/Makefile
@@ -3,7 +3,8 @@ MODULE_TOPDIR = ../../..
 include $(MODULE_TOPDIR)/include/Make/Other.make
 include $(MODULE_TOPDIR)/include/Make/Python.make
 
-MODULES = mlchk mlclassify mlconf mlplts mlsegment mlstats mltraining mlwriterast
+MODULES = mlchk mlclassify mlconf mlplts mlsegment mlstats mltraining \
+          mlwriterast mltransform
 
 ETCDIR = $(ETC)/ml.class
 

diff --git a/libmlcls/mlstats.py b/libmlcls/mlstats.py
@@ -69,6 +69,5 @@ def statistics(group, zones, ratio, hdf, stat_name, stat_results):
     df = pnl2df(get_pnl(gmaps, stat_name))
     # 'photo_r_mean', 'photo_g_mean', 'photo_b_mean'
     add_ratio(df, ratio)
-    df_stand = (df - df.mean()) / df.std()
-    df_stand.to_hdf(hdf, str(stat_results))
-    return df_stand
+    df.to_hdf(hdf, str(stat_results))
+    return df
diff --git a/libmlcls/mltransform.py b/libmlcls/mltransform.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Aug 16 14:44:30 2013
+
+@author: pietro
+"""
+
+
+#
+# TRANSFORMATION FUNCTIONS
+#
+def stdize(df, mean, std):
+    """Return ``df`` shifted by ``mean`` and divided by ``std``."""
+    centered = df - mean
+    return centered / std
+
+
+def normalize(df, xmin, xmax):
+    """Return ``df`` shifted by ``xmin`` and divided by ``xmax - xmin``."""
+    offset = df - xmin
+    span = xmax - xmin
+    return offset / span
+
+
+def transform(method, data, chk):
+    """Rescale ``data`` and ``chk`` with statistics computed on ``chk``.
+
+    method: 'standardize', 'normalize' or 'nothing'.
+    data:   values to transform.
+    chk:    reference values; must provide ``mean()``/``std()`` for
+            'standardize' and ``min()``/``max()`` for 'normalize'.
+
+    Returns the tuple ``(transformed data, transformed chk)``; with
+    'nothing' both inputs are returned unchanged.
+    Raises ValueError when ``method`` is not one of the names above.
+    """
+    if method == 'standardize':
+        mean, std = chk.mean(), chk.std()
+        # The second element must be the transformed ``chk`` (as in the
+        # 'normalize' branch), not a second copy of the transformed ``data``.
+        return stdize(data, mean, std), stdize(chk, mean, std)
+    elif method == 'normalize':
+        xmin, xmax = chk.min(), chk.max()
+        return normalize(data, xmin, xmax), normalize(chk, xmin, xmax)
+    elif method == 'nothing':
+        return data, chk
+    else:
+        raise ValueError('method: <%s> not valid' % method)
diff --git a/ml.classify/ml.classify.py b/ml.classify/ml.classify.py
@@ -34,7 +34,7 @@
 #%  type: string
 #%  description: Name of the statistic data contained in the HDF file
 #%  required: yes
-#%  answer: K_all
+#%  answer: data
 #%end
 #%option
 #%  key: training_json
@@ -45,11 +45,11 @@
 #%  answer: training.json
 #%end
 #%option
-#%  key: training_Kchk
+#%  key: training_kchk
 #%  type: string
 #%  description: Name for the training values in the HDF
 #%  required: no
-#%  answer: K_chk
+#%  answer:
 #%  guisection: Training
 #%end
 #%option
@@ -113,6 +113,14 @@
 #%  required: no
 #%  answer: classify_using_%s
 #%end
+#%option
+#%  key: transform
+#%  type: string
+#%  options: nothing,standardize,normalize
+#%  description: Choose the transformation function of the statistics data of the segments
+#%  required: no
+#%  answer: nothing
+#%end
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 import json
@@ -139,39 +147,37 @@
 sys.path.append(path)
 from mlchk import check_classification
 from mltraining import extract_itr, extract_training
+from mltransform import transform
 from mlclassify import mls_classification
 
 
 def main(opts, flgs):
     if not opts['training_hdf']:
         opts['training_hdf'] = opts['hdf']
 
-    #import ipdb; ipdb.set_trace()
-    K_all = pnd.read_hdf(opts['hdf'], str(opts['data']))
+    data = pnd.read_hdf(opts['hdf'], str(opts['data']))
 
-    if opts['training_Kchk'] and opts['training_ychk']:
-        Kchk = pnd.read_hdf(opts['training_hdf'], str(opts['training_Kchk']))
+    if opts['training_kchk'] and opts['training_ychk']:
+        Kchk = pnd.read_hdf(opts['training_hdf'], str(opts['training_kchk']))
         ychk = pnd.read_hdf(opts['training_hdf'], str(opts['training_ychk']))
         itr = extract_itr(Kchk, ychk, int(opts['training_number']))
     else:
         with open(opts['training_json'], 'r') as fp:
             tr = json.load(fp)
-            itr, Kchk, ychk = extract_training(tr, K_all,
+            check_classification(tr)
+            itr, Kchk, ychk = extract_training(tr, data,
                                                int(opts['training_number']))
-    Kchk = pnd.DataFrame(Kchk.copy().tolist(),
-                         index=Kchk.index,
-                         columns=Kchk.iloc[0].index)
-    K_chk, y_chk = Kchk.loc[itr], ychk.loc[itr]
+#    Kchk = pnd.DataFrame(Kchk.copy().tolist(),
+#                         index=Kchk.index,
+#                         columns=Kchk.iloc[0].index)
     conf = imp.load_source("conf", opts['training_conf'])
     mls = getattr(conf, opts['training_mls'])
     key = None if opts['training_key'] == '' else opts['training_key']
-    with open(opts['training_json'], 'r') as fp:
-        tr = json.load(fp)
-        check_classification(tr)
-        # mls_classification(K_all, K_chk, y_chk, mls, hdf, out_class)
-        mls_classification(K_all, K_chk, y_chk, mls,
-                           opts['hdf'], opts['out_class'],
-                           key=key)
+    tdata, tKchk = transform(opts['transform'], data, Kchk)
+    tK_chk, y_chk = tKchk.loc[itr], ychk.loc[itr]
+    # mls_classification(data, K_chk, y_chk, mls, hdf, out_class)
+    mls_classification(tdata, tK_chk, y_chk, mls,
+                       opts['hdf'], opts['out_class'], key=key)
 
 
 if __name__ == "__main__":
@@ -183,6 +189,7 @@ def main(opts, flgs):
 training_conf=mlconf.py \
 training_mls=BEST \
 training_hdf=/home/pietro/docdat/phd/edinburgh/segSVM/segments-ml/data.hdf \
+training_kchk=K_chk \
 training_ychk=y_chk \
 hdf=results.hdf
 

diff --git a/ml.segstats/ml.segstats.py b/ml.segstats/ml.segstats.py
@@ -112,7 +112,9 @@ def main(opts, flgs):
         segment(opts['thrs'], opts['group'], opts['seg_opts'],
                 opts['seg_name'])
     if 'r' in flgs and flgs['r']:
-        statistics(opts['group'], opts['seg_name'] % opts['thrs'][-1],
+        sname = opts['seg_name'] % opts['thrs'][-1] \
+                if '%' in opts['seg_name'] else opts['seg_name']
+        statistics(opts['group'], sname,
                    opts['stat_ratio_cols'].split(','), opts['hdf'],
                    opts['stat_name'], opts['stat_results'])