Bug 1083971 - Add an option to output a binary file for the PSL data r=leplatrem,erahm
author Arpit Bharti <arpitbharti73@gmail.com>
Tue, 02 Jul 2019 12:28:48 +0000
changeset 540575 822cb68b6ab75c96d7e36aa1f7fffda122d41f0c
parent 540574 99f94dd8c8f1f0ca13196051cf608b420b5df731
child 540576 dee2008c7a7d05bc882eb368dee6a3cc7d1f90a6
push id 11529
push user archaeopteryx@coole-files.de
push date Thu, 04 Jul 2019 15:22:33 +0000
treeherder mozilla-beta@ebb510a784b8 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers leplatrem, erahm
bugs 1083971
milestone 69.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1083971 - Add an option to output a binary file for the PSL data r=leplatrem,erahm
Differential Revision: https://phabricator.services.mozilla.com/D34364
netwerk/dns/prepare_tlds.py
xpcom/ds/tools/make_dafsa.py
--- a/netwerk/dns/prepare_tlds.py
+++ b/netwerk/dns/prepare_tlds.py
@@ -3,17 +3,17 @@
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import codecs
 import encodings.idna
 import imp
 import os
 import re
 import sys
-from make_dafsa import words_to_cxx
+from make_dafsa import words_to_cxx, words_to_bin
 
 """
 Processes a file containing effective TLD data.  See the following URL for a
 description of effective TLDs and of the file format that this script
 processes (although for the latter you're better off just reading this file's
 short source code).
 
 http://wiki.mozilla.org/Gecko:Effective_TLD_Service
@@ -93,21 +93,22 @@ class EffectiveTLDEntry:
     "True if this entry represents a class of effective TLDs."
     return self._wild
 
 
 #################
 # DO EVERYTHING #
 #################
 
-def main(output, effective_tld_filename):
+def main(output, effective_tld_filename, output_format="cxx"):
   """
   effective_tld_filename is the effective TLD file to parse.
-  A C++ array of a binary representation of a DAFSA representing the
-  eTLD file is then printed to output.
+  Depending on output_format, either a C++ array holding a binary
+  representation of a DAFSA for the eTLD file (the default, "cxx") or the
+  raw DAFSA bytes ("bin") is written to output.
   """
 
   def typeEnum(etld):
     """
     Maps the flags to the DAFSA's enum types.
     """
     if etld.exception():
       return 1
@@ -118,12 +119,31 @@ def main(output, effective_tld_filename)
 
   def dafsa_words():
     """
     make_dafsa expects lines of the form "<domain_name><enum_value>"
     """
     for etld in getEffectiveTLDs(effective_tld_filename):
       yield "%s%d" % (etld.domain(), typeEnum(etld))
 
-  output.write(words_to_cxx(dafsa_words()))
+  """ words_to_bin() returns bytes while words_to_cxx() returns a string. """
+  if output_format == "bin":
+    if sys.version_info[0] >= 3:
+      output = output.buffer
+    output.write(words_to_bin(dafsa_words()))
+  else:
+    output.write(words_to_cxx(dafsa_words()))
+
+
 
 if __name__ == '__main__':
-    main(sys.stdout, sys.argv[1])
+    """
+    This program can output the DAFSA in two formats:
+    as C++ code that will be included and compiled at build time
+    or as a binary file that will be published in Remote Settings.
+
+    Command-line flag selecting the format:
+    (no flag) -> C++ array [default]
+    "--bin"   -> Binary file
+    """
+
+    output_format = "bin" if "--bin" in sys.argv else "cxx"
+    main(sys.stdout, sys.argv[1], output_format=output_format)
--- a/xpcom/ds/tools/make_dafsa.py
+++ b/xpcom/ds/tools/make_dafsa.py
@@ -188,16 +188,17 @@ The bytes in the generated array has the
  7: 0x81 <return_value> 0x81 & 0x0F -> return 1
 
  8: 0x62 <char>         label character 0x62 -> match "b"
  9: 0x62 <char>         label character 0x62 -> match "b"
 10: 0x82 <return_value> 0x82 & 0x0F -> return 2
 """
 
 import sys
+import struct
 
 
 class InputError(Exception):
     """Exception raised for errors in the input file."""
 
 
 def to_dafsa(words):
     """Generates a DAFSA from a word list and returns the source node.
@@ -377,17 +378,17 @@ def encode_links(children, offsets, curr
     buf.reverse()
     return buf
 
 
 def encode_prefix(label):
     """Encodes a node label as a list of bytes without a trailing high byte.
 
     This method encodes a node if there is exactly one child  and the
-    child follows immidiately after so that no jump is needed. This label
+    child follows immediately after so that no jump is needed. This label
     will then be a prefix to the label in the child node.
     """
     assert label
     return [ord(c) for c in reversed(label)]
 
 
 def encode_label(label):
     """Encodes a node label as a list of bytes with a trailing high byte >0x80.
@@ -411,16 +412,23 @@ def encode(dafsa):
             output.extend(encode_links(node[1], offsets, len(output)))
             output.extend(encode_label(node[0]))
         offsets[id(node)] = len(output)
 
     output.extend(encode_links(dafsa, offsets, len(output)))
     output.reverse()
     return output
 
+def encode_words(words):
+    """Generates a dafsa representation of a word list"""
+    dafsa = to_dafsa(words)
+    for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
+        dafsa = fun(dafsa)
+    return dafsa
+
 
 def to_cxx(data, preamble=None):
     """Generates C++ code from a list of encoded bytes."""
     text = '/* This file is generated. DO NOT EDIT!\n\n'
     text += 'The byte array encodes a dictionary of strings and values. See '
     text += 'make_dafsa.py for documentation.'
     text += '*/\n\n'
 
@@ -434,22 +442,27 @@ def to_cxx(data, preamble=None):
         text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
         text += ',\n'
     text += '};\n'
     return text
 
 
 def words_to_cxx(words, preamble=None):
     """Generates C++ code from a word list"""
-    dafsa = to_dafsa(words)
-    for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
-        dafsa = fun(dafsa)
+    dafsa = encode_words(words)
     return to_cxx(encode(dafsa), preamble)
 
 
+def words_to_bin(words):
+    """Generates bytes from a word list"""
+    dafsa = encode_words(words)
+    data = encode(dafsa)
+    return struct.pack('%dB' % len(data), *data)
+
+
 def parse_gperf(infile):
     """Parses gperf file and extract strings and return code"""
     lines = [line.strip() for line in infile]
 
     # Extract the preamble.
     first_delimeter = lines.index('%%')
     preamble = '\n'.join(lines[0:first_delimeter])