diff --git a/CHANGELOG.md b/CHANGELOG.md
index 40ead0aa4b..6b58f8b40c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Tutorial on how to load models dynamically
https://intel.github.io/dffml/tutorials/models/load.html
- Added download progressbar in `util/net.py`
+### Changed
+- Calls to hashlib now go through helper functions
### Fixed
- Record object key properties are now always strings
diff --git a/configloader/image/tests/test_config.py b/configloader/image/tests/test_config.py
index a2b181c690..6084b0a989 100644
--- a/configloader/image/tests/test_config.py
+++ b/configloader/image/tests/test_config.py
@@ -1,8 +1,8 @@
import json
import pathlib
-import hashlib
from dffml.util.asynctestcase import AsyncTestCase
+from dffml.util.crypto import secure_hash
from dffml_config_image.configloader import PNGConfigLoader
IMAGE1_HASH = "6faf9050c6d387bc6a68d9e12127f883011add2ec994b8e66c7c0996636f2789af8d28fc11e6528a327a6383c1473e72"
@@ -22,8 +22,8 @@ async def test_dumpb_loadb(self):
/ "image1.png"
).read_bytes()
original = await ctx.loadb(image_bytes)
- hash_original = hashlib.sha384(
- json.dumps(original.flatten().tolist()).encode()
- ).hexdigest()
+ hash_original = secure_hash(
+ json.dumps(original.flatten().tolist()), algorithm="sha384"
+ )
self.assertEqual(original.shape, (280, 280, 3))
self.assertEqual(hash_original, IMAGE1_HASH)
diff --git a/dffml/cli/dataflow.py b/dffml/cli/dataflow.py
index 8edae433f1..00e56251a9 100644
--- a/dffml/cli/dataflow.py
+++ b/dffml/cli/dataflow.py
@@ -1,5 +1,4 @@
import pathlib
-import hashlib
import contextlib
from typing import List, Dict, Any
@@ -27,6 +26,7 @@
KeysCMDConfig,
)
from ..util.cli.parser import ParseInputsAction
+from ..util.crypto import insecure_hash
from ..base import config, field
from ..high_level import run as run_dataflow
@@ -488,45 +488,37 @@ async def run(self):
# Skip stage if not wanted
if self.stages and stage.value not in self.stages:
continue
- stage_node = hashlib.md5(
- ("stage." + stage.value).encode()
- ).hexdigest()
+ stage_node = insecure_hash("stage." + stage.value)
if len(self.stages) != 1:
print(f"subgraph {stage_node}[{stage.value.title()} Stage]")
print(f"style {stage_node} fill:#afd388b5,stroke:#a4ca7a")
for instance_name, operation in dataflow.operations.items():
if operation.stage != stage:
continue
- subgraph_node = hashlib.md5(
- ("subgraph." + instance_name).encode()
- ).hexdigest()
- node = hashlib.md5(instance_name.encode()).hexdigest()
+ subgraph_node = insecure_hash("subgraph." + instance_name)
+ node = insecure_hash(instance_name)
if not self.simple:
print(f"subgraph {subgraph_node}[{instance_name}]")
print(f"style {subgraph_node} fill:#fff4de,stroke:#cece71")
print(f"{node}[{operation.instance_name}]")
for input_name in operation.inputs.keys():
- input_node = hashlib.md5(
- ("input." + instance_name + "." + input_name).encode()
- ).hexdigest()
+ input_node = insecure_hash(
+ "input." + instance_name + "." + input_name
+ )
if not self.simple:
print(f"{input_node}({input_name})")
print(f"{input_node} --> {node}")
for output_name in operation.outputs.keys():
- output_node = hashlib.md5(
- (
- "output." + instance_name + "." + output_name
- ).encode()
- ).hexdigest()
+ output_node = insecure_hash(
+ "output." + instance_name + "." + output_name
+ )
if not self.simple:
print(f"{output_node}({output_name})")
print(f"{node} --> {output_node}")
for condition in operation.conditions:
- condition_node = hashlib.md5(
- (
- "condition." + instance_name + "." + condition.name
- ).encode()
- ).hexdigest()
+ condition_node = insecure_hash(
+ "condition." + instance_name + "." + condition.name
+ )
if not self.simple:
print(f"{condition_node}{'{' + condition.name + '}'}")
print(f"{condition_node} --> {node}")
@@ -541,15 +533,15 @@ async def run(self):
operation = dataflow.operations[instance_name]
if self.stages and not operation.stage.value in self.stages:
continue
- node = hashlib.md5(instance_name.encode()).hexdigest()
+ node = insecure_hash(instance_name)
for input_name, sources in input_flow.inputs.items():
for source in sources:
# TODO Put various sources in their own "Inputs" subgraphs
if isinstance(source, str):
input_definition = operation.inputs[input_name]
- seed_input_node = hashlib.md5(
- (source + "." + input_definition.name).encode()
- ).hexdigest()
+ seed_input_node = insecure_hash(
+ source + "." + input_definition.name
+ )
                            print(
                                f"{seed_input_node}({source}<br>{input_definition.name})"
                            )
@@ -558,11 +550,9 @@ async def run(self):
f"style {seed_input_node} fill:#f6dbf9,stroke:#a178ca"
)
if not self.simple:
- input_node = hashlib.md5(
- (
- "input." + instance_name + "." + input_name
- ).encode()
- ).hexdigest()
+ input_node = insecure_hash(
+ "input." + instance_name + "." + input_name
+ )
print(f"{seed_input_node} --> {input_node}")
else:
print(f"{seed_input_node} --> {node}")
@@ -577,9 +567,9 @@ async def run(self):
origin_definition_name = (
origin + "." + definition_name
)
- seed_input_node = hashlib.md5(
- origin_definition_name.encode()
- ).hexdigest()
+                            seed_input_node = insecure_hash(
+                                origin_definition_name
+                            )
                            print(
                                f"{seed_input_node}({source}<br>{origin_definition_name})"
                            )
@@ -588,14 +578,9 @@ async def run(self):
f"style {seed_input_node} fill:#f6dbf9,stroke:#a178ca"
)
if not self.simple:
- input_node = hashlib.md5(
- (
- "input."
- + instance_name
- + "."
- + input_name
- ).encode()
- ).hexdigest()
+ input_node = insecure_hash(
+ "input." + instance_name + "." + input_name
+ )
print(f"{seed_input_node} --> {input_node}")
else:
print(f"{seed_input_node} --> {node}")
@@ -612,67 +597,51 @@ async def run(self):
):
source = source[0]
if not self.simple:
- source_output_node = hashlib.md5(
- (
- "output."
- + ".".join(list(source.items())[0])
- ).encode()
- ).hexdigest()
- input_node = hashlib.md5(
- (
- "input." + instance_name + "." + input_name
- ).encode()
- ).hexdigest()
+ source_output_node = insecure_hash(
+ "output." + ".".join(list(source.items())[0])
+ )
+ input_node = insecure_hash(
+ "input." + instance_name + "." + input_name
+ )
+
print(f"{source_output_node} --> {input_node}")
else:
- source_operation_node = hashlib.md5(
- list(source.keys())[0].encode()
- ).hexdigest()
+ source_operation_node = insecure_hash(
+ list(source.keys())[0]
+ )
print(f"{source_operation_node} --> {node}")
for i, condition in enumerate(input_flow.conditions):
if isinstance(condition, str):
if not self.simple:
condition_name = operation.conditions[i].name
- seed_condition_node = hashlib.md5(
- (condition + "." + condition_name).encode()
- ).hexdigest()
+ seed_condition_node = insecure_hash(
+ condition + "." + condition_name
+ )
print(f"{seed_condition_node}({condition_name})")
- seed_dependent_node = hashlib.md5(
- (
- "condition."
- + instance_name
- + "."
- + condition_name
- ).encode()
- ).hexdigest()
+ seed_dependent_node = insecure_hash(
+ "condition." + instance_name + "." + condition_name
+ )
print(
f"{seed_condition_node} --> {seed_dependent_node}"
)
else:
if not self.simple:
- dependee_node = hashlib.md5(
- (
- "output."
- + ".".join(list(condition.items())[0])
- ).encode()
- ).hexdigest()
- dependent_node = hashlib.md5(
- (
- "condition."
- + instance_name
- + "."
- + dataflow.operations[
- list(condition.keys())[0]
- ]
- .outputs[list(condition.values())[0]]
- .name
- ).encode()
- ).hexdigest()
+ dependee_node = insecure_hash(
+ "output." + ".".join(list(condition.items())[0])
+ )
+ dependent_node = insecure_hash(
+ "condition."
+ + instance_name
+ + "."
+ + dataflow.operations[list(condition.keys())[0]]
+ .outputs[list(condition.values())[0]]
+ .name
+ )
print(f"{dependee_node} --> {dependent_node}")
else:
- dependee_operation_node = hashlib.md5(
- list(condition.keys())[0].encode()
- ).hexdigest()
+ dependee_operation_node = insecure_hash(
+ list(condition.keys())[0]
+ )
print(f"{dependee_operation_node} --> {node}")
if len(self.stages) != 1:
print(f"end")
diff --git a/dffml/df/memory.py b/dffml/df/memory.py
index a3df8ce358..876d1ddd9b 100644
--- a/dffml/df/memory.py
+++ b/dffml/df/memory.py
@@ -3,7 +3,6 @@
import copy
import asyncio
import secrets
-import hashlib
import inspect
import itertools
import traceback
@@ -76,6 +75,7 @@
from ..util.cli.arg import Arg
from ..util.data import ignore_args
from ..util.asynchelper import aenter_stack, concurrently
+from ..util.crypto import secure_hash
from .log import LOGGER
@@ -859,7 +859,7 @@ def _unique(instance_name: str, handle: str, *uids: str) -> str:
operation.instance_name, and the sorted list of input uuids.
"""
uid_list = [instance_name, handle] + sorted(uids)
- return hashlib.sha384("".join(uid_list).encode("utf-8")).hexdigest()
+ return secure_hash("".join(uid_list), "sha384")
async def unique(
self, operation: Operation, parameter_set: BaseParameterSet
diff --git a/dffml/model/model.py b/dffml/model/model.py
index b7df86d477..7eb4a2ca1e 100644
--- a/dffml/model/model.py
+++ b/dffml/model/model.py
@@ -7,7 +7,6 @@
"""
import abc
import json
-import hashlib
import pathlib
from typing import AsyncIterator, Optional
diff --git a/dffml/util/crypto.py b/dffml/util/crypto.py
new file mode 100644
index 0000000000..168821d5e4
--- /dev/null
+++ b/dffml/util/crypto.py
@@ -0,0 +1,20 @@
+"""
+All hashing originates in this file for easier auditing.
+"""
+import hashlib
+
+
+SECURE_HASH_ALGORITHM = hashlib.sha384
+INSECURE_HASH_ALGORITHM = hashlib.md5
+
+
+def secure_hash(string, algorithm) -> str:
+    if isinstance(string, str):
+        string = string.encode("utf-8")
+    return hashlib.new(algorithm, string).hexdigest()
+
+
+def insecure_hash(string) -> str:
+ if isinstance(string, str):
+ string = string.encode("utf-8")
+ return INSECURE_HASH_ALGORITHM(string).hexdigest()
diff --git a/dffml/util/file.py b/dffml/util/file.py
index f75252ee4f..e1ed894444 100644
--- a/dffml/util/file.py
+++ b/dffml/util/file.py
@@ -1,8 +1,9 @@
import re
-import hashlib
import pathlib
from typing import Union, Tuple
+from .crypto import SECURE_HASH_ALGORITHM
+
class NoHashToUseForValidationSuppliedError(Exception):
"""
@@ -81,7 +82,7 @@ def validate_file_hash(
filepath = pathlib.Path(filepath)
if expected_sha384_hash is None:
raise NoHashToUseForValidationSuppliedError(filepath)
- filehash = hashlib.sha384()
+ filehash = SECURE_HASH_ALGORITHM()
with open(filepath, "rb") as fileobj:
bytes_read = fileobj.read(chunk_size)
filehash.update(bytes_read)
diff --git a/model/scikit/dffml_model_scikit/scikit_base.py b/model/scikit/dffml_model_scikit/scikit_base.py
index 5c0ebcdfb4..a69a2c0362 100644
--- a/model/scikit/dffml_model_scikit/scikit_base.py
+++ b/model/scikit/dffml_model_scikit/scikit_base.py
@@ -4,7 +4,6 @@
Base class for Scikit models
"""
import json
-import hashlib
import pathlib
import logging
import importlib
@@ -30,6 +29,7 @@
from dffml.source.source import Sources, SourcesContext
from dffml.model.model import ModelConfig, ModelContext, Model, ModelNotTrained
from dffml.feature.feature import Features, Feature
+from dffml.util.crypto import secure_hash
class ScikitConfig(ModelConfig, NamedTuple):
@@ -64,9 +64,9 @@ def _feature_predict_hash(self):
if k not in ["features", "tcluster", "predict"]
]
)
- return hashlib.sha384(
- "".join([params] + self.features).encode()
- ).hexdigest()
+ return secure_hash(
+ "".join([params] + self.features), algorithm="sha384"
+ )
@property
def _filepath(self):
diff --git a/model/tensorflow/dffml_model_tensorflow/dnnc.py b/model/tensorflow/dffml_model_tensorflow/dnnc.py
index 1f5b2e41fa..a4771d22a2 100644
--- a/model/tensorflow/dffml_model_tensorflow/dnnc.py
+++ b/model/tensorflow/dffml_model_tensorflow/dnnc.py
@@ -4,7 +4,6 @@
"""
import os
import abc
-import hashlib
import inspect
import pathlib
diff --git a/model/tensorflow_hub/dffml_model_tensorflow_hub/text_classifier.py b/model/tensorflow_hub/dffml_model_tensorflow_hub/text_classifier.py
index 5dc3ee0ada..e8530ecf42 100644
--- a/model/tensorflow_hub/dffml_model_tensorflow_hub/text_classifier.py
+++ b/model/tensorflow_hub/dffml_model_tensorflow_hub/text_classifier.py
@@ -5,7 +5,6 @@
"""
# TODO Add docstrings
import os
-import hashlib
import pathlib
import importlib
from typing import AsyncIterator, Tuple, Any, List, Type
@@ -17,6 +16,7 @@
from dffml.record import Record
from dffml.model.accuracy import Accuracy
from dffml.util.entrypoint import entrypoint
+from dffml.util.crypto import secure_hash
from dffml.base import config, field
from dffml.feature.feature import Feature, Features
from dffml.source.source import Sources, SourcesContext
@@ -118,7 +118,7 @@ def _model_dir_path(self):
str(len(self.cids)),
self.parent.config.model_path,
]
- model = hashlib.sha384("".join(_to_hash).encode("utf-8")).hexdigest()
+ model = secure_hash("".join(_to_hash), algorithm="sha384")
# Needed to save updated model
if not os.path.isdir(self.parent.config.directory):
raise NotADirectoryError(
diff --git a/model/vowpalWabbit/dffml_model_vowpalWabbit/vw_base.py b/model/vowpalWabbit/dffml_model_vowpalWabbit/vw_base.py
index 607be283dd..71ddfe8744 100644
--- a/model/vowpalWabbit/dffml_model_vowpalWabbit/vw_base.py
+++ b/model/vowpalWabbit/dffml_model_vowpalWabbit/vw_base.py
@@ -5,7 +5,6 @@
"""
import os
import json
-import hashlib
from pathlib import Path
from typing import AsyncIterator, Tuple, Any, List
@@ -19,6 +18,7 @@
from dffml.model.accuracy import Accuracy
from dffml.source.source import Sources, SourcesContext
from dffml.util.entrypoint import entrypoint
+from dffml.util.crypto import secure_hash
from dffml.base import config, field
from dffml.feature.feature import Features, Feature
from dffml.model.model import ModelContext, Model, ModelNotTrained
@@ -141,9 +141,9 @@ def _feature_predict_hash(self):
]
)
params = "".join(params)
- return hashlib.sha384(
- "".join([params] + self.features).encode()
- ).hexdigest()
+ return secure_hash(
+ "".join([params] + self.features), algorithm="sha384"
+ )
def applicable_features(self, features):
usable = []
@@ -449,7 +449,7 @@ def __init__(self, config) -> None:
def _filename(self):
return os.path.join(
self.config.directory,
- hashlib.sha384(self.config.predict.name.encode()).hexdigest()
+ secure_hash(self.config.predict.name, algorithm="sha384")
+ ".json",
)
diff --git a/tests/source/test_idx.py b/tests/source/test_idx.py
index d0def76b40..a1e3df14b7 100644
--- a/tests/source/test_idx.py
+++ b/tests/source/test_idx.py
@@ -1,8 +1,8 @@
import json
import pathlib
-import hashlib
from dffml.util.net import cached_download
+from dffml.util.crypto import secure_hash
from dffml.util.asynctestcase import AsyncTestCase
from dffml.source.idx1 import IDX1SourceConfig, IDX1Source
@@ -51,9 +51,8 @@ async def test_idx3(self, filename):
self.assertIn(feature_name, records[0].features())
for i in range(-1, 1):
with self.subTest(index=i):
- is_hash = hashlib.sha384(
- json.dumps(
- records[i].feature(feature_name)
- ).encode()
- ).hexdigest()
+ is_hash = secure_hash(
+ json.dumps(records[i].feature(feature_name)),
+ algorithm="sha384",
+ )
self.assertEqual(is_hash, IDX3_FIRST_LAST[i])