Wednesday, October 21, 2020

Anonymizer Performance Compare GoLang vs. Python



In this post we will create a sample application in both Go and Python, and then compare the performance of the two programming languages.

For the purpose of the comparison, we will create an anonymizer - an application that reads a JSON file and replaces private and confidential information with anonymized text. The anonymizer is a good example application: it relies on JSON parsing and regular expressions, so it exercises the core of each language's standard library.



The GO Application


The GO application starts with reading a file, JSON parsing it, analyzing the JSON, and printing the modified anonymized JSON result.


// main loads in.json, decodes it into a generic JSON object, anonymizes the
// decoded document in place via recurseMap, and prints the re-encoded JSON to
// standard output. Any read, decode, or encode error aborts with a panic.
//
// NOTE(review): ioutil.ReadFile is deprecated since Go 1.16 in favor of
// os.ReadFile; switching requires updating the file's imports.
func main() {
	raw, readErr := ioutil.ReadFile("in.json")
	if readErr != nil {
		panic(readErr)
	}

	var root map[string]interface{}
	if decodeErr := json.Unmarshal(raw, &root); decodeErr != nil {
		panic(decodeErr)
	}

	// Confidential values are overwritten inside root itself.
	recurseMap(root, "")

	encoded, encodeErr := json.Marshal(root)
	if encodeErr != nil {
		panic(encodeErr)
	}

	fmt.Print(string(encoded))
}


To analyze the JSON, we recursively scan all its elements, and if a specific element should be anonymized, we replace its value with the text "PRIVATE".


// recurseMap walks every entry of element and hands each value to
// recurseObject together with the child path (path + "/" + key). When
// recurseObject returns a non-nil replacement, the entry is overwritten in
// place; otherwise it is left untouched.
func recurseMap(element map[string]interface{}, path string) {
	for name, child := range element {
		replacement := recurseObject(child, path+"/"+name)
		if replacement == nil {
			continue
		}
		// Assigning to an existing key while ranging does not change the key set.
		element[name] = replacement
	}
}

// recurseArray walks every item of array and hands it to recurseObject with
// the child path formatted as path[index]. Items for which recurseObject
// returns a non-nil replacement are overwritten in place.
func recurseArray(array []interface{}, path string) {
	for idx, item := range array {
		childPath := fmt.Sprintf("%v[%v]", path, idx)
		if replacement := recurseObject(item, childPath); replacement != nil {
			array[idx] = replacement
		}
	}
}

// recurseObject dispatches on the dynamic type of value.
//
// Maps and slices are processed in place by recurseMap / recurseArray and
// yield nil, because the container itself is never replaced. Any other value
// is checked with isConfidential: a confidential value yields the replacement
// "PRIVATE", everything else yields nil (meaning "keep as is").
func recurseObject(value interface{}, path string) interface{} {
	switch node := value.(type) {
	case map[string]interface{}:
		recurseMap(node, path)
	case []interface{}:
		recurseArray(node, path)
	default:
		if isConfidential(value) {
			return "PRIVATE"
		}
	}
	return nil
}



Finally, the actual anonymizer uses regular expressions to locate text that should be anonymized. In this example, we hide IP addresses, credit card numbers, and street addresses.


// Patterns consulted by isConfidential. They are compiled once at package
// scope so that each call only pays for matching.
var (
	// creditRegex matches a whole string consisting of a digits-only card
	// number in one of the listed prefix/length shapes.
	creditRegex = regexp.MustCompile(`^(?:4[0-9]{12}(?:[0-9]{3})?|[25][1-7][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\d{3})\d{11})$`)

	// addressRegex matches a string that begins with a number followed by two
	// whitespace-separated words of ASCII letters, e.g. "12 Main Street".
	// The class is [A-Za-z] rather than [A-z]: the range A-z also spans the
	// punctuation characters [ \ ] ^ _ and backtick, which made strings such
	// as "12 ^^ __" count as addresses.
	addressRegex = regexp.MustCompile(`^\d+\s[A-Za-z]+\s[A-Za-z]+`)

	// ipRegex matches a whole string that is a dotted-quad IPv4 address with
	// each octet in the range 0-255.
	ipRegex = regexp.MustCompile(`^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$`)
)

// isConfidential reports whether value is a string that matches the credit
// card, address, or IP address pattern. Non-string values are never
// confidential.
func isConfidential(value interface{}) bool {
	text, isString := value.(string)
	if !isString {
		return false
	}

	return creditRegex.MatchString(text) ||
		addressRegex.MatchString(text) ||
		ipRegex.MatchString(text)
}


The Python Application


The python application starts with reading a file, JSON parsing it, analyzing the JSON, and printing the modified anonymized JSON result.


# Read the input document, anonymize it in place, and print it back as JSON.
with open('in.json', 'r') as source:
    data = source.read()

document = json.loads(data)
# recurse_map overwrites confidential values inside `document` itself.
recurse_map(document, "")
data = json.dumps(document)
print(data)

To analyze the JSON, we recursively scan all its elements, and if a specific element should be anonymized, we replace its value with the text "PRIVATE".


def recurse_object(element, path):
    """Dispatch on the exact type of ``element``.

    Dicts and lists are processed in place by ``recurse_map`` /
    ``recurse_list`` and yield ``None`` because the container itself is never
    replaced. Any other value is checked with ``is_confidential``: a
    confidential value yields the replacement ``"PRIVATE"``, everything else
    yields ``None`` (meaning "keep as is").
    """
    kind = type(element)
    if kind is dict:
        recurse_map(element, path)
    elif kind is list:
        recurse_list(element, path)
    elif is_confidential(element):
        return "PRIVATE"
    return None


def recurse_list(list, path):
    """Visit every item of ``list`` and overwrite confidential ones in place.

    Each item is passed to ``recurse_object`` with the child path
    ``"<path>/[<index>]"``; a non-``None`` result replaces the item.
    """
    # NOTE: the parameter name shadows the built-in ``list``; it is kept
    # because it is part of the function's signature.
    for position, item in enumerate(list):
        child_path = "{}/[{}]".format(path, position)
        replacement = recurse_object(item, child_path)
        if replacement is None:
            continue
        list[position] = replacement


def recurse_map(element, path):
    """Visit every entry of ``element`` and overwrite confidential values in place.

    Each value is passed to ``recurse_object`` with the child path
    ``path + "/" + key``; a non-``None`` result replaces the value.
    """
    # Re-assigning an existing key does not change the dict's size, so it is
    # safe to do while iterating over the items view.
    for name, child in element.items():
        replacement = recurse_object(child, path + "/" + name)
        if replacement is not None:
            element[name] = replacement


Finally, the actual anonymizer uses regular expressions to locate text that should be anonymized. In this example, we hide IP addresses, credit card numbers, and street addresses.


# Patterns consulted by is_confidential, compiled once at import time.
# Raw strings are used so the regex escapes do not need doubling.

# A whole string consisting of a digits-only card number in one of the listed
# prefix/length shapes.
credit_regex = re.compile(
    r'^(?:4[0-9]{12}(?:[0-9]{3})?|[25][1-7][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\d{3})\d{11})$')

# A string beginning with a number followed by two whitespace-separated words
# of ASCII letters, e.g. "12 Main Street". The class is [A-Za-z] rather than
# [A-z]: the range A-z also spans the punctuation characters [ \ ] ^ _ and
# backtick, which made strings such as "12 ^^ __" count as addresses.
address_regex = re.compile(r'^\d+\s[A-Za-z]+\s[A-Za-z]+')

# A dotted-quad IPv4 address with each octet in the range 0-255.
ip_regex = re.compile(r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$')


def is_confidential(value):
    """Return True if ``value`` is a string matching any confidential pattern.

    The credit card, address, and IP address patterns are tried in that
    order. Non-string values are never confidential.
    """
    if not isinstance(value, str):
        return False

    patterns = (credit_regex, address_regex, ip_regex)
    return any(pattern.match(value) for pattern in patterns)



The Performance Tests Results


To test the performance of the applications, we use a 1-megabyte JSON file, and we run the JSON parsing and the anonymization multiple times.

Go test code:


// Benchmark fragment: decode, anonymize, and re-encode the same input 500
// times, then print the elapsed time followed by the last encoded document.
// It relies on bytes (the raw file contents) and err declared by the
// surrounding code.
var data []byte
startTime := time.Now()
for i := 0; i < 500; i++ {
	// Decode into a fresh document each pass so every iteration does the
	// same amount of work on the original input.
	var document map[string]interface{}
	if err = json.Unmarshal(bytes, &document); err != nil {
		panic(err)
	}

	recurseMap(document, "")

	data, err = json.Marshal(document)
	if err != nil {
		panic(err)
	}
}

// time.Since(t) is the idiomatic shorthand for time.Now().Sub(t).
passed := time.Since(startTime)
fmt.Printf("%v\n", passed)
fmt.Print(string(data))



Python test code:


# Benchmark: parse, anonymize, and re-serialize the same input 500 times,
# then print the elapsed wall-clock time.
with open('in.json', 'r') as f:
    data = f.read()

start_time = datetime.datetime.now()
for i in range(500):
    # Always parse the original text. Storing the serialized output back into
    # `data` would make every pass after the first re-process JSON that is
    # already anonymized, so the loop would no longer measure the same work.
    document = json.loads(data)
    recurse_map(document, "")
    result = json.dumps(document)
delta = datetime.datetime.now() - start_time
print(delta)


The Python code completes in ~32 seconds, and the Go code completes in ~15 milliseconds.


Edit:

I've rerun the tests, and apparently I was using different files for Go and Python :(
After using the same JSON input file for both, I've found that the results are almost identical.



No comments:

Post a Comment