Skip to content

Commit

Permalink
Fix issue with conflicting max_ids (#549)
Browse files Browse the repository at this point in the history
* Fix issue with conflicting max_ids

* Always set PrivateReader to match latest query

* Fix unit test

* Fix for CI

* CI fix

---------

Co-authored-by: Joshua <[email protected]>
  • Loading branch information
joshua-oss and joshua-oss committed Apr 24, 2023
1 parent 081be28 commit f20614d
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 8 deletions.
46 changes: 46 additions & 0 deletions datasets/PUMS_dup_twotable.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Two-table metadata: schema PUMS contains tables PUMS (max_ids: 1) and
# PUMS2 (max_ids: 3). Both tables declare the same columns and use `pid`
# as the private identifier.
"":
  PUMS:
    PUMS:
      max_ids: 1
      rows: 1000
      age:
        type: int
        lower: 0
        upper: 100
      sex:
        type: string
      educ:
        type: string
      race:
        type: string
      income:
        type: int
        lower: 0
        upper: 500000
      married:
        type: string
      pid:
        type: int
        private_id: True
    PUMS2:
      max_ids: 3
      rows: 1000
      age:
        type: int
        lower: 0
        upper: 100
      sex:
        type: string
      educ:
        type: string
      race:
        type: string
      income:
        type: int
        lower: 0
        upper: 500000
      married:
        type: string
      pid:
        type: int
        private_id: True
46 changes: 46 additions & 0 deletions datasets/PUMS_dup_twotable_reverse.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Two-table metadata with the tables listed in reverse order: schema PUMS
# contains PUMS2 (max_ids: 3) first, then PUMS (max_ids: 1). Both tables
# declare the same columns and use `pid` as the private identifier.
"":
  PUMS:
    PUMS2:
      max_ids: 3
      rows: 1000
      age:
        type: int
        lower: 0
        upper: 100
      sex:
        type: string
      educ:
        type: string
      race:
        type: string
      income:
        type: int
        lower: 0
        upper: 500000
      married:
        type: string
      pid:
        type: int
        private_id: True
    PUMS:
      max_ids: 1
      rows: 1000
      age:
        type: int
        lower: 0
        upper: 100
      sex:
        type: string
      educ:
        type: string
      race:
        type: string
      income:
        type: int
        lower: 0
        upper: 500000
      married:
        type: string
      pid:
        type: int
        private_id: True
4 changes: 1 addition & 3 deletions sql/snsql/sql/private_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ def _refresh_options(self):
self._options.censor_dims = not any([not t.censor_dims for t in tables])
self._options.reservoir_sample = any([t.sample_max_ids for t in tables])
self._options.clamp_counts = any([t.clamp_counts for t in tables])
self._options.max_contrib = max([t.max_ids for t in tables])
self._options.use_dpsu = any([t.use_dpsu for t in tables])
self._options.clamp_columns = any([t.clamp_columns for t in tables])

Expand Down Expand Up @@ -280,8 +279,7 @@ def _rewrite_ast(self, query):
if isinstance(query, str):
raise ValueError("Please pass a Query AST object to _rewrite_ast()")
query_max_contrib = query.max_ids
if self._options.max_contrib is None or self._options.max_contrib > query_max_contrib:
self._options.max_contrib = query_max_contrib
self._options.max_contrib = query_max_contrib

self._refresh_options()
query = self.rewriter.query(query)
Expand Down
21 changes: 20 additions & 1 deletion sql/tests/engines/test_db_count.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
import os
import subprocess
import pytest
import sys
from snsql import Privacy, from_connection

from snsql.sql.privacy import Privacy
# Ask git for the repository root so the dataset paths below do not depend
# on the directory the tests are launched from.
git_root_dir = (
    subprocess.check_output(["git", "rev-parse", "--show-toplevel"])
    .decode("utf-8")
    .strip()
)

# Metadata files describing the same two tables, listed in opposite order.
two_table_meta_a = os.path.join(git_root_dir, "datasets", "PUMS_dup_twotable.yaml")
two_table_meta_b = os.path.join(git_root_dir, "datasets", "PUMS_dup_twotable_reverse.yaml")

# Privacy parameters shared by the tests in this module.
privacy = Privacy(alphas=[0.01, 0.05], epsilon=30.0, delta=0.1)

Expand Down Expand Up @@ -31,6 +37,19 @@ def test_db_counts(self, test_databases):
upper = 1224000
print(f"Table {dbname}.PUMS.{tablename} has {n} COUNT(age) rows in {reader.engine}")
assert(n > lower and n < upper)
def test_with_two_table_meta(self, test_databases):
    """Run COUNT(age) on PUMS.PUMS under both two-table metadata files.

    Each metadata file declares two tables with different ``max_ids``
    values, in opposite order. The noisy count must fall inside the same
    bounds whichever file is used.

    Args:
        test_databases: fixture exposing ``get_connection(database=..., engine=...)``,
            which returns ``None`` when the engine is not available.
    """
    # The query has no placeholders, so it is a plain string built once.
    query = 'SELECT COUNT(age) FROM PUMS.PUMS'
    for engine in ['postgres', 'sqlserver']:
        dbdataset = test_databases.get_connection(database='PUMS_null', engine=engine)
        if dbdataset is None:
            continue
        # pandas doesn't support multiple tables per metadata
        table_name = dbdataset.table_name
        conn = dbdataset.connection
        if table_name.upper() != 'PUMS.PUMS':
            continue
        for meta in [two_table_meta_a, two_table_meta_b]:
            reader = from_connection(conn, metadata=meta, privacy=privacy)
            # Row 0 is skipped; the count is read from row 1, column 0.
            count_age = reader.execute(query)[1][0]
            assert 890 < count_age < 1020
def test_db_counts_star(self, test_databases):
# Actual is 1000
for dbname in ['PUMS', 'PUMS_pid', 'PUMS_large', 'PUMS_dup', 'PUMS_null']:
Expand Down
5 changes: 2 additions & 3 deletions sql/tests/query/test_thresholds.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,6 @@ def test_check_thresholds_gauss(self):
deltas = [10E-5, 10E-15]
query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
reader = PandasReader(df, schema)
qp = QueryParser(schema)
q = qp.query(query)
for eps in epsilons:
for d in max_contribs:
for delta in deltas:
Expand All @@ -108,8 +106,9 @@ def test_check_thresholds_gauss(self):
gaus_rho = 1 + gaus_scale * math.sqrt(2 * math.log(d / math.sqrt(2 * math.pi * delta)))
schema_c = copy.copy(schema)
schema_c["PUMS.PUMS"].max_ids = d
qp = QueryParser(schema_c)
q = qp.query(query)
private_reader = PrivateReader(reader, metadata=schema_c, privacy=privacy)
assert(private_reader._options.max_contrib == d)
r = private_reader._execute_ast(q)
assert(private_reader.tau < gaus_rho * 3 and private_reader.tau > gaus_rho / 3)
def test_empty_result_count_typed_notau_prepost(self):
Expand Down
2 changes: 1 addition & 1 deletion sql/tests/setup/dataloader/factories/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ def connect(self, dataset):
print(f'Postgres: Connected {dataset} to {dbname} as {table_name}')
except Exception as e:
print(str(e))
print(f"Unable to connect to postgres datset {dataset}. Ensure connection info is correct and psycopg2 is installed")
print(f"Unable to connect to postgres dataset {dataset}. Ensure connection info is correct and psycopg2 is installed")

0 comments on commit f20614d

Please sign in to comment.