In this post we will create a sample application in both Go and Python, and then we will compare the performance of the two programming languages.
For the purpose of the comparison, we will create an anonymizer - an application that reads a JSON file and replaces private and confidential information with anonymized text. The anonymizer is a good example application: it uses JSON parsing and regular expressions, so it exercises the core libraries of each language.
The Go Application
The Go application starts by reading a file, parsing it as JSON, analyzing the JSON, and printing the modified, anonymized JSON result.
// main reads in.json, anonymizes the decoded document in place, and prints
// the re-encoded JSON to standard output. Any read, decode, or encode error
// aborts the program with a panic.
func main() {
	raw, readErr := ioutil.ReadFile("in.json")
	if readErr != nil {
		panic(readErr)
	}

	var document map[string]interface{}
	if decodeErr := json.Unmarshal(raw, &document); decodeErr != nil {
		panic(decodeErr)
	}

	// recurseMap rewrites the document in place; it returns nothing.
	recurseMap(document, "")

	encoded, encodeErr := json.Marshal(document)
	if encodeErr != nil {
		panic(encodeErr)
	}
	fmt.Print(string(encoded))
}
To analyze the JSON, we recursively scan all its elements, and if a specific element should be anonymized, we replace its value with the text "PRIVATE".
// recurseMap visits every entry of element and overwrites each entry for
// which recurseObject returns a non-nil replacement. path is the location of
// element within the document; each child is visited with path + "/" + key.
func recurseMap(element map[string]interface{}, path string) {
	for name := range element {
		childPath := path + "/" + name
		if replacement := recurseObject(element[name], childPath); replacement != nil {
			// Only existing keys are overwritten, which is safe while ranging.
			element[name] = replacement
		}
	}
}
// recurseArray visits every item of array and overwrites each item for which
// recurseObject returns a non-nil replacement. The path passed for item i is
// path followed by "[i]".
func recurseArray(array []interface{}, path string) {
	for index := range array {
		itemPath := fmt.Sprintf("%v[%v]", path, index)
		replacement := recurseObject(array[index], itemPath)
		if replacement == nil {
			continue
		}
		array[index] = replacement
	}
}
// recurseObject processes a single decoded JSON value located at path.
//
// Maps and arrays are descended into and modified in place, so nil is
// returned for them. For any other value, the string "PRIVATE" is returned
// when isConfidential reports true, and nil otherwise. A nil result means
// the caller should leave the value untouched.
func recurseObject(value interface{}, path string) interface{} {
	switch child := value.(type) {
	case map[string]interface{}:
		recurseMap(child, path)
	case []interface{}:
		recurseArray(child, path)
	default:
		if isConfidential(value) {
			return "PRIVATE"
		}
	}
	return nil
}
Finally, the actual anonymizer uses regular expressions to locate text that should be anonymized. In this example, we hide IP addresses, credit card numbers, and street addresses.
// Patterns that identify confidential string values. They are compiled once
// at package initialization so that matching never pays the compile cost.
var (
	// creditRegex is a fully anchored pattern for digit-only strings shaped
	// like payment card numbers.
	creditRegex = regexp.MustCompile(`^(?:4[0-9]{12}(?:[0-9]{3})?|[25][1-7][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\d{3})\d{11})$`)

	// addressRegex matches strings that begin with a number followed by two
	// whitespace-separated alphabetic words, e.g. "221 Baker Street". It is
	// anchored at the start only. The class is [A-Za-z] rather than [A-z]
	// because the A-z range also covers the characters [ \ ] ^ _ and `.
	addressRegex = regexp.MustCompile(`^\d+\s[A-Za-z]+\s[A-Za-z]+`)

	// ipRegex is a fully anchored pattern for dotted-quad IPv4 addresses
	// whose four octets are each in the range 0-255.
	ipRegex = regexp.MustCompile(`^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$`)
)
// isConfidential reports whether value is a string that matches any of the
// credit card, address, or IP patterns. Non-string values are never
// considered confidential.
func isConfidential(value interface{}) bool {
	text, isString := value.(string)
	if !isString {
		return false
	}
	// Short-circuit evaluation keeps the original check order.
	return creditRegex.MatchString(text) ||
		addressRegex.MatchString(text) ||
		ipRegex.MatchString(text)
}
The Python Application
The Python application starts by reading a file, parsing it as JSON, analyzing the JSON, and printing the modified, anonymized JSON result.
# Load the input document, anonymize it in place, and print it as JSON.
with open('in.json', 'r') as source:
    document = json.loads(source.read())

# recurse_map mutates the document; it has no return value.
recurse_map(document, "")
print(json.dumps(document))
To analyze the JSON, we recursively scan all its elements, and if a specific element should be anonymized, we replace its value with the text "PRIVATE".
def recurse_object(element, path):
    """Process a single decoded JSON value located at ``path``.

    Dicts and lists are descended into and modified in place, so ``None`` is
    returned for them. For any other value, ``"PRIVATE"`` is returned when
    ``is_confidential`` reports true, and ``None`` otherwise. A ``None``
    result means the caller should leave the value untouched.
    """
    kind = type(element)
    if kind is dict:
        recurse_map(element, path)
    elif kind is list:
        recurse_list(element, path)
    elif is_confidential(element):
        return "PRIVATE"
    return None
def recurse_list(list, path):
    """Visit every item of ``list`` and overwrite those needing replacement.

    Each item is handed to ``recurse_object`` with the path
    ``"<path>/[<index>]"``; a non-``None`` result replaces the item in place.
    """
    for index in range(len(list)):
        replacement = recurse_object(list[index], "{}/[{}]".format(path, index))
        if replacement is None:
            continue
        list[index] = replacement
def recurse_map(element, path):
    """Visit every entry of ``element`` and overwrite those needing replacement.

    Each value is handed to ``recurse_object`` with the path
    ``path + "/" + key``; a non-``None`` result replaces the value in place.
    """
    for key, value in element.items():
        replacement = recurse_object(value, path + "/" + key)
        if replacement is not None:
            # Only existing keys are reassigned, so the dict size is stable
            # during iteration.
            element[key] = replacement
Finally, the actual anonymizer uses regular expressions to locate text that should be anonymized. In this example, we hide IP addresses, credit card numbers, and street addresses.
# Patterns that identify confidential string values, compiled once at import.
# Raw strings are used so the regex escapes do not need doubled backslashes.

# Anchored pattern for digit-only strings shaped like payment card numbers.
credit_regex = re.compile(
    r'^(?:4[0-9]{12}(?:[0-9]{3})?|[25][1-7][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\d{3})\d{11})$')

# Strings that begin with a number followed by two whitespace-separated
# alphabetic words, e.g. "221 Baker Street". The class is [A-Za-z] rather
# than [A-z] because the A-z range also covers the characters [ \ ] ^ _ and `.
address_regex = re.compile(r'^\d+\s[A-Za-z]+\s[A-Za-z]+')

# Anchored pattern for dotted-quad IPv4 addresses with octets in 0-255.
ip_regex = re.compile(r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$')
def is_confidential(value):
    """Return True when ``value`` is a ``str`` matching any confidential pattern.

    The credit card, address, and IP patterns are tried in that order, each
    matched from the start of the string. Values whose type is not exactly
    ``str`` are never considered confidential.
    """
    if type(value) is not str:
        return False
    patterns = (credit_regex, address_regex, ip_regex)
    return any(pattern.match(value) for pattern in patterns)
The Performance Test Results
To test the performance of the applications, we use a 1-megabyte JSON file, and we run the JSON parsing and the anonymization multiple times.
Go test code:
// Benchmark: decode, anonymize, and re-encode the same input 500 times, then
// print the total elapsed time followed by the last encoded document.
// NOTE(review): assumes bytes (the raw file content) and err were declared by
// the preceding file-reading code - confirm against the full program.
var data []byte
startTime := time.Now()
for i := 0; i < 500; i++ {
	// A fresh map per iteration so every pass decodes the untouched input.
	var document map[string]interface{}
	if err = json.Unmarshal(bytes, &document); err != nil {
		panic(err)
	}
	recurseMap(document, "")
	if data, err = json.Marshal(document); err != nil {
		panic(err)
	}
}
// time.Since(t) is the idiomatic shorthand for time.Now().Sub(t).
passed := time.Since(startTime)
fmt.Printf("%v\n", passed)
fmt.Print(string(data))
Python test code:
# Benchmark: parse, anonymize, and re-encode the same input 500 times, then
# print the total elapsed time.
with open('in.json', 'r') as f:
    data = f.read()

start_time = datetime.datetime.now()
for i in range(500):
    # Always parse the original file content. Storing the encoded result back
    # into `data` would make every iteration after the first parse the
    # already anonymized output instead of the input file.
    document = json.loads(data)
    recurse_map(document, "")
    result = json.dumps(document)
delta = datetime.datetime.now() - start_time
print(delta)
Edit:
I've rerun the tests, and apparently I was using different files for Go and Python :(
After using the same JSON input file for both, I've found that the results are almost identical.
No comments:
Post a Comment