src.private_count_sketch.private_cs_client

  1from sympy import primerange
  2import random
  3import numpy as np
  4import importlib.util
  5import os
  6import argparse
  7import time
  8from progress.bar import Bar
  9from tabulate import tabulate
 10import sys
 11import pandas as pd
 12import pickle
 13import statistics
 14
 15from utils.utils import load_dataset, generate_hash_functions, display_results, generate_error_table, generate_hash_function_G
 16
 17class privateCSClient:
 18    def __init__(self, epsilon, k, m, dataset, domain, dataset_name):
 19        self.dataset_name = dataset_name
 20        self.epsilon = epsilon
 21        self.k = k
 22        self.m = m
 23        self.dataset = dataset
 24        self.domain = domain
 25        self.N = len(dataset)
 26
 27        # Creation of the sketch matrix
 28        self.M = np.zeros((self.k, self.m))
 29
 30        # List to store the privatized matrices
 31        self.client_matrix = []
 32
 33        # Definition of the hash family 3 by 3
 34        primes = list(primerange(10**6, 10**7))
 35        p = primes[random.randint(0, len(primes)-1)]
 36        self.H = generate_hash_functions(self.k,p, 3,self.m)
 37
 38        #Definition of the hash family 4 by 4
 39        prime = 2**61 - 1
 40        self.G = generate_hash_function_G(self.k, prime)
 41
 42    
 43    def bernoulli_vector(self):
 44        b = np.random.binomial(1, (np.exp(self.epsilon / 2) / (np.exp(self.epsilon / 2) + 1)), self.m)
 45        b = 2 * b - 1 
 46        return b
 47
 48    def client(self, d):
 49        j = random.randint(0, self.k-1)
 50        v = np.zeros(self.m)
 51
 52        v[self.H[j](d)] = 1 * self.G[j](d)
 53        
 54        b = self.bernoulli_vector()
 55        v_aux = v * b 
 56
 57        self.client_matrix.append((v_aux,j))
 58        return v_aux, j
 59
 60    def update_sketch_matrix(self, v, j):
 61        c_e = (np.exp(self.epsilon / 2) + 1) / (np.exp(self.epsilon / 2) - 1)
 62        x = self.k * ((c_e/2) * v + (1/2) * np.ones_like(v))
 63        for i in range (self.m):
 64            self.M[j,i] += x[i] 
 65
 66    def estimate_client(self, d):
 67        median_vector = []
 68        for i in range(self.k):
 69            median_vector.append(self.M[i, self.H[i](d)] * self.G[i](d))
 70        median = statistics.median(median_vector)
 71
 72        #f_estimated = (self.m/(self.m-1))*(median -(self.N/self.m))
 73        f_estimated = (self.m/(self.m-1))*(median)
 74        return f_estimated
 75    
 76    def execute_client(self):
 77        bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
 78        privatized_data = []
 79        for d in self.dataset:
 80            v_i, j_i = self.client(d)
 81            privatized_data.append((v_i, j_i))
 82            bar.next()
 83        bar.finish()
 84        
 85        df_client_matrix = pd.DataFrame(privatized_data, columns=['v', 'j'])
 86
 87        data_dict = df_client_matrix.to_dict(orient='list')
 88
 89        script_dir = os.path.dirname(os.path.abspath(__file__))
 90        output_dir = os.path.join(script_dir, "../../data/privatized")
 91
 92        output_file = os.path.join(output_dir, f"{self.dataset_name}_private.pkl")
 93    
 94        with open(output_file, 'wb') as f:
 95            pickle.dump(privatized_data, f)
 96    
 97        df_client_matrix.to_csv(os.path.join(output_dir, f"{self.dataset_name}_private.csv"), index=False)
 98        return privatized_data
 99    
100    def server_simulator(self,privatized_data):
101        bar = Bar('Update sketch matrix', max=len(privatized_data), suffix='%(percent)d%%')
102        
103        for data in privatized_data:
104            self.update_sketch_matrix(data[0],data[1])
105            bar.next()
106        bar.finish()
107
108        F_estimated = {}
109        for x in self.domain:
110            F_estimated[x] = self.estimate_client(x)
111            bar.next()
112        bar.finish()
113        return F_estimated, self.H, self.G
114
def run_private_cs_client(k, m, e, d):
    """Run the full private Count Sketch pipeline for dataset ``d``.

    Loads ``<d>_filtered`` through ``load_dataset``, privatizes it, simulates
    the server aggregation, writes the estimated frequencies to
    ``../../data/frequencies/<d>_freq_estimated_cms.csv`` (relative to this
    file) and displays the results.

    Args:
        k: number of sketch rows.
        m: number of sketch columns.
        e: privacy parameter epsilon.
        d: dataset name stem.

    Returns:
        Tuple ``(H, data_table, G)``: the two hash families used and the
        table returned by ``display_results``.
    """
    dataset, df, domain = load_dataset(f"{d}_filtered")

    # Initialize the private Count Sketch client
    sketch_client = privateCSClient(e, k, m, dataset, domain, d)

    # Client side: process the private data
    privatized_data = sketch_client.execute_client()

    # Simulate the server side
    f_estimated, H, G = sketch_client.server_simulator(privatized_data)

    # Save f_estimated to a file
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../../data/frequencies")
    # Create the target directory so the CSV write cannot fail on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    # The "_cms" suffix is kept unchanged so existing consumers still find the file.
    df_estimated.to_csv(os.path.join(output_dir, f"{d}_freq_estimated_cms.csv"), index=False)

    # Show the results
    data_table = display_results(df, f_estimated)
    return H, data_table, G
class privateCSClient:
 18class privateCSClient:
 19    def __init__(self, epsilon, k, m, dataset, domain, dataset_name):
 20        self.dataset_name = dataset_name
 21        self.epsilon = epsilon
 22        self.k = k
 23        self.m = m
 24        self.dataset = dataset
 25        self.domain = domain
 26        self.N = len(dataset)
 27
 28        # Creation of the sketch matrix
 29        self.M = np.zeros((self.k, self.m))
 30
 31        # List to store the privatized matrices
 32        self.client_matrix = []
 33
 34        # Definition of the hash family 3 by 3
 35        primes = list(primerange(10**6, 10**7))
 36        p = primes[random.randint(0, len(primes)-1)]
 37        self.H = generate_hash_functions(self.k,p, 3,self.m)
 38
 39        #Definition of the hash family 4 by 4
 40        prime = 2**61 - 1
 41        self.G = generate_hash_function_G(self.k, prime)
 42
 43    
 44    def bernoulli_vector(self):
 45        b = np.random.binomial(1, (np.exp(self.epsilon / 2) / (np.exp(self.epsilon / 2) + 1)), self.m)
 46        b = 2 * b - 1 
 47        return b
 48
 49    def client(self, d):
 50        j = random.randint(0, self.k-1)
 51        v = np.zeros(self.m)
 52
 53        v[self.H[j](d)] = 1 * self.G[j](d)
 54        
 55        b = self.bernoulli_vector()
 56        v_aux = v * b 
 57
 58        self.client_matrix.append((v_aux,j))
 59        return v_aux, j
 60
 61    def update_sketch_matrix(self, v, j):
 62        c_e = (np.exp(self.epsilon / 2) + 1) / (np.exp(self.epsilon / 2) - 1)
 63        x = self.k * ((c_e/2) * v + (1/2) * np.ones_like(v))
 64        for i in range (self.m):
 65            self.M[j,i] += x[i] 
 66
 67    def estimate_client(self, d):
 68        median_vector = []
 69        for i in range(self.k):
 70            median_vector.append(self.M[i, self.H[i](d)] * self.G[i](d))
 71        median = statistics.median(median_vector)
 72
 73        #f_estimated = (self.m/(self.m-1))*(median -(self.N/self.m))
 74        f_estimated = (self.m/(self.m-1))*(median)
 75        return f_estimated
 76    
 77    def execute_client(self):
 78        bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
 79        privatized_data = []
 80        for d in self.dataset:
 81            v_i, j_i = self.client(d)
 82            privatized_data.append((v_i, j_i))
 83            bar.next()
 84        bar.finish()
 85        
 86        df_client_matrix = pd.DataFrame(privatized_data, columns=['v', 'j'])
 87
 88        data_dict = df_client_matrix.to_dict(orient='list')
 89
 90        script_dir = os.path.dirname(os.path.abspath(__file__))
 91        output_dir = os.path.join(script_dir, "../../data/privatized")
 92
 93        output_file = os.path.join(output_dir, f"{self.dataset_name}_private.pkl")
 94    
 95        with open(output_file, 'wb') as f:
 96            pickle.dump(privatized_data, f)
 97    
 98        df_client_matrix.to_csv(os.path.join(output_dir, f"{self.dataset_name}_private.csv"), index=False)
 99        return privatized_data
100    
101    def server_simulator(self,privatized_data):
102        bar = Bar('Update sketch matrix', max=len(privatized_data), suffix='%(percent)d%%')
103        
104        for data in privatized_data:
105            self.update_sketch_matrix(data[0],data[1])
106            bar.next()
107        bar.finish()
108
109        F_estimated = {}
110        for x in self.domain:
111            F_estimated[x] = self.estimate_client(x)
112            bar.next()
113        bar.finish()
114        return F_estimated, self.H, self.G
privateCSClient(epsilon, k, m, dataset, domain, dataset_name)
19    def __init__(self, epsilon, k, m, dataset, domain, dataset_name):
20        self.dataset_name = dataset_name
21        self.epsilon = epsilon
22        self.k = k
23        self.m = m
24        self.dataset = dataset
25        self.domain = domain
26        self.N = len(dataset)
27
28        # Creation of the sketch matrix
29        self.M = np.zeros((self.k, self.m))
30
31        # List to store the privatized matrices
32        self.client_matrix = []
33
34        # Definition of the hash family 3 by 3
35        primes = list(primerange(10**6, 10**7))
36        p = primes[random.randint(0, len(primes)-1)]
37        self.H = generate_hash_functions(self.k,p, 3,self.m)
38
39        #Definition of the hash family 4 by 4
40        prime = 2**61 - 1
41        self.G = generate_hash_function_G(self.k, prime)
dataset_name
epsilon
k
m
dataset
domain
N
M
client_matrix
H
G
def bernoulli_vector(self):
44    def bernoulli_vector(self):
45        b = np.random.binomial(1, (np.exp(self.epsilon / 2) / (np.exp(self.epsilon / 2) + 1)), self.m)
46        b = 2 * b - 1 
47        return b
def client(self, d):
49    def client(self, d):
50        j = random.randint(0, self.k-1)
51        v = np.zeros(self.m)
52
53        v[self.H[j](d)] = 1 * self.G[j](d)
54        
55        b = self.bernoulli_vector()
56        v_aux = v * b 
57
58        self.client_matrix.append((v_aux,j))
59        return v_aux, j
def update_sketch_matrix(self, v, j):
61    def update_sketch_matrix(self, v, j):
62        c_e = (np.exp(self.epsilon / 2) + 1) / (np.exp(self.epsilon / 2) - 1)
63        x = self.k * ((c_e/2) * v + (1/2) * np.ones_like(v))
64        for i in range (self.m):
65            self.M[j,i] += x[i] 
def estimate_client(self, d):
67    def estimate_client(self, d):
68        median_vector = []
69        for i in range(self.k):
70            median_vector.append(self.M[i, self.H[i](d)] * self.G[i](d))
71        median = statistics.median(median_vector)
72
73        #f_estimated = (self.m/(self.m-1))*(median -(self.N/self.m))
74        f_estimated = (self.m/(self.m-1))*(median)
75        return f_estimated
def execute_client(self):
77    def execute_client(self):
78        bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
79        privatized_data = []
80        for d in self.dataset:
81            v_i, j_i = self.client(d)
82            privatized_data.append((v_i, j_i))
83            bar.next()
84        bar.finish()
85        
86        df_client_matrix = pd.DataFrame(privatized_data, columns=['v', 'j'])
87
88        data_dict = df_client_matrix.to_dict(orient='list')
89
90        script_dir = os.path.dirname(os.path.abspath(__file__))
91        output_dir = os.path.join(script_dir, "../../data/privatized")
92
93        output_file = os.path.join(output_dir, f"{self.dataset_name}_private.pkl")
94    
95        with open(output_file, 'wb') as f:
96            pickle.dump(privatized_data, f)
97    
98        df_client_matrix.to_csv(os.path.join(output_dir, f"{self.dataset_name}_private.csv"), index=False)
99        return privatized_data
def server_simulator(self, privatized_data):
101    def server_simulator(self,privatized_data):
102        bar = Bar('Update sketch matrix', max=len(privatized_data), suffix='%(percent)d%%')
103        
104        for data in privatized_data:
105            self.update_sketch_matrix(data[0],data[1])
106            bar.next()
107        bar.finish()
108
109        F_estimated = {}
110        for x in self.domain:
111            F_estimated[x] = self.estimate_client(x)
112            bar.next()
113        bar.finish()
114        return F_estimated, self.H, self.G
def run_private_cs_client(k, m, e, d):
def run_private_cs_client(k, m, e, d):
    """Run the full private Count Sketch pipeline for dataset ``d``.

    Loads ``<d>_filtered`` through ``load_dataset``, privatizes it, simulates
    the server aggregation, writes the estimated frequencies to
    ``../../data/frequencies/<d>_freq_estimated_cms.csv`` (relative to this
    file) and displays the results.

    Args:
        k: number of sketch rows.
        m: number of sketch columns.
        e: privacy parameter epsilon.
        d: dataset name stem.

    Returns:
        Tuple ``(H, data_table, G)``: the two hash families used and the
        table returned by ``display_results``.
    """
    dataset, df, domain = load_dataset(f"{d}_filtered")

    # Initialize the private Count Sketch client
    sketch_client = privateCSClient(e, k, m, dataset, domain, d)

    # Client side: process the private data
    privatized_data = sketch_client.execute_client()

    # Simulate the server side
    f_estimated, H, G = sketch_client.server_simulator(privatized_data)

    # Save f_estimated to a file
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../../data/frequencies")
    # Create the target directory so the CSV write cannot fail on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    # The "_cms" suffix is kept unchanged so existing consumers still find the file.
    df_estimated.to_csv(os.path.join(output_dir, f"{d}_freq_estimated_cms.csv"), index=False)

    # Show the results
    data_table = display_results(df, f_estimated)
    return H, data_table, G