diff --git a/pycdlib/dr.py b/pycdlib/dr.py
index 78b24663..768f8baa 100644
--- a/pycdlib/dr.py
+++ b/pycdlib/dr.py
@@ -164,7 +164,7 @@ class DirectoryRecord:
                  'index_in_parent', 'dr_len', 'xattr_len', 'file_flags',
                  'file_unit_size', 'interleave_gap_size', 'len_fi', 'isdir',
                  'orig_extent_loc', 'data_length', 'seqnum', 'is_root',
-                 'parent', 'rock_ridge', 'xa_record', 'file_ident')
+                 'parent', 'rock_ridge', 'xa_record', 'file_ident', '_sort_key')
 
     FILE_FLAG_EXISTENCE_BIT = 0
     FILE_FLAG_DIRECTORY_BIT = 1
@@ -285,12 +285,20 @@ def parse(self, vd, record, parent, xa=False):
 
         if self.is_root:
             self._printable_name = '/'.encode(vd.encoding)
+            self._sort_key = b'\x00'
         elif self.file_ident == b'\x00':
             self._printable_name = '.'.encode(vd.encoding)
+            self._sort_key = b'\x00'
         elif self.file_ident == b'\x01':
             self._printable_name = '..'.encode(vd.encoding)
+            self._sort_key = b'\x01'
         else:
             self._printable_name = self.file_ident
+            # Bisect sort key for __lt__: dot/dotdot get b'\x00'/b'\x01' so
+            # they always sort first; real names get a 0xff prefix so they
+            # stay above dotdot even for Joliet, where UCS-2 names start
+            # with a 0x00 byte and would otherwise compare lower than b'\x01'.
+            self._sort_key = b'\xff' + self.file_ident
 
         if self.parent is not None:
             if xa:
@@ -514,12 +522,16 @@ def _new(self, vd, name, parent, seqnum, isdir, length, xa, date_seconds):
 
         if self.is_root:
             self._printable_name = '/'.encode(vd.encoding)
+            self._sort_key = b'\x00'
         elif self.file_ident == b'\x00':
             self._printable_name = '.'.encode(vd.encoding)
+            self._sort_key = b'\x00'
         elif self.file_ident == b'\x01':
             self._printable_name = '..'.encode(vd.encoding)
+            self._sort_key = b'\x01'
         else:
             self._printable_name = self.file_ident
+            self._sort_key = b'\xff' + self.file_ident
 
         self.vd = vd
 
@@ -1255,36 +1267,14 @@ def fp_offset(self):
     ############# END BACKWARDS COMPATIBILITY #################################
 
     def __lt__(self, other):
-        # This method is used for the bisect.insort_left() when adding a child.
-        # It needs to return whether self is less than other. Here we use the
-        # ISO9660 sorting order which is essentially:
-        #
-        # 1. The \x00 is always the 'dot' record, and is always first.
-        # 2. The \x01 is always the 'dotdot' record, and is always second.
-        # 3. Other entries are sorted lexically; this does not exactly match
-        # the sorting method specified in Ecma-119, but does OK for now.
-        #
-        # Ecma-119 Section 9.3 specifies that we need to pad out the shorter of
-        # the two files with 0x20 (spaces), then compare byte-by-byte until
-        # they differ. However, we can more easily just do the string equality
-        # comparison, since it will always be the case that 0x20 will be less
-        # than any of the other allowed characters in the strings.
-        if self.file_ident == b'\x00':
-            if other.file_ident == b'\x00':
-                return False
-            return True
-        if other.file_ident == b'\x00':
-            return False
-
-        if self.file_ident == b'\x01':
-            if other.file_ident == b'\x00':
-                return False
-            return True
-
-        if other.file_ident == b'\x01':
-            # If self.file_ident was '\x00', it would have been caught above.
-            return False
-        return self.file_ident < other.file_ident
+        # Used by bisect.bisect_left() in _add_child(). Both records carry a
+        # precomputed _sort_key (set when file_ident is finalized) that
+        # encodes the ECMA-119 ordering: dot first, then dotdot, then real
+        # names — see the _sort_key assignments in parse() / _new(). Real
+        # names compare bytewise; that doesn't match the strict ECMA-119
+        # 9.3 padding rule but is equivalent here, since the pad byte 0x20
+        # is less than any other allowed file-identifier character.
+        return self._sort_key < other._sort_key
 
     def __ne__(self, other):
         # type: (object) -> bool
diff --git a/pycdlib/pycdlib.py b/pycdlib/pycdlib.py
index a902bd8a..e7b7f066 100644
--- a/pycdlib/pycdlib.py
+++ b/pycdlib/pycdlib.py
@@ -432,17 +432,17 @@ def _yield_children(rec, rr):
             continue
 
         last = fi
-        skip_child = False
         if rr:
             if child.rock_ridge is not None:
-                for inner_child in child.children:
-                    if inner_child.is_dotdot():
-                        if inner_child.rock_ridge is not None and inner_child.rock_ridge.parent_link_record_exists():
-                            skip_child = True
-                            break
-
-                if skip_child:
-                    continue
+                # For a non-root directory, dot is expected at children[0] and
+                # dotdot at children[1] (the DirectoryRecord.__lt__ order); the
+                # is_dotdot() check confirms that instead of assuming it. If
+                # dotdot carries a PL record this is a relocated destination,
+                # which is yielded via the cl_to_moved_dr path elsewhere — skip it.
+                if len(child.children) >= 2:
+                    dotdot = child.children[1]
+                    if dotdot.is_dotdot() and dotdot.rock_ridge is not None and dotdot.rock_ridge.parent_link_record_exists():
+                        continue
 
                 if child.rock_ridge.child_link_record_exists() and \
                         child.rock_ridge.cl_to_moved_dr is not None and \
@@ -489,18 +489,20 @@ def _find_dr_record_by_name(vd, path, encoding):
 
     entry = root_dir_record
 
-    tmpdr = dr.DirectoryRecord()
-
     while True:
         child = None
 
         thelist = entry.children
+        # Bisect for currpath among the real entries. Children index 0/1 are
+        # dot/dotdot, so we start at 2. All entries from index 2 on are real
+        # names that compare bytewise, matching dr.__lt__ on the real-entry
+        # path; comparing file_ident directly avoids materializing a scratch
+        # DirectoryRecord just to drive __lt__.
         lo = 2
         hi = len(thelist)
         while lo < hi:
             mid = (lo + hi) // 2
-            tmpdr.file_ident = currpath
-            if thelist[mid] < tmpdr:
+            if thelist[mid].file_ident < currpath:
                 lo = mid + 1
             else:
                 hi = mid