root/branches/add_stat_caching/ftpsync-0.1/rsyncmatch.py

Revision 545, 13.0 kB (checked in by schwa, 2 years ago)
Code contributed by Martin Wilck. (Thank you!) I might use some of
this code to add caching of stat results to ftputil.
  • Property svn:eol-style set to native
Line 
1 import re
2 import os
3 import sys
4 import loggingclass
5
# Single-character markers used both as rule types and as verdicts.
# INCLUDE and EXCLUDE are what GlobChain.match() returns; DONE is handed to
# a collector function once GlobChain.recurse() has finished walking.
INCLUDE = "+"
EXCLUDE = "-"
DONE = "."
9
class RsyncGlob(loggingclass.LoggingClass):
    r"""
    A class that imitates rsync(1)'s way of include/exclude patterns.
    Similar to glob()/fnmatch(), but "*" doesn't match "/" - "**" does.
    See man rsync(1) for the exclude/include logic.
    The GlobChain class creates filter chains like rsync's.

    RsyncGlob(pattern), where <pattern> follows the rules in the rsync(1)
    man page. In particular, "/XYZ" matches only at the "root", and "XYZ/"
    matches only directories. A pattern may start with "+ " or "- " to
    mark it as an INCLUDE or EXCLUDE rule, and a backslash makes the
    character that follows it literal.

    Attributes set by the constructor:
      type        INCLUDE, EXCLUDE or None (no "+ "/"- " prefix given)
      glob        the pattern without rule prefix and trailing "/"
      dir_match   True if the pattern ended in "/" (directories only)
      path_match  True if the pattern contains "/" or "**", i.e. it is
                  applied to the whole path instead of the base name
      pat, re     the generated regexp source and its compiled form

    NOTE: This class uses Unix file name conventions.
    It will be pretty simple to implement it for DOS/Windows though,
    if someone volunteers.

    NOTE(review): self.logger is presumably provided by
    loggingclass.LoggingClass - confirm there.

    The following doctest example shows how the globbing works.
    File names are given relative to the root, without a leading "/".

>>> globs=(RsyncGlob("s*m"),RsyncGlob("s**m"),
...        RsyncGlob(r"s\*m"),RsyncGlob("/s*m"),
...        RsyncGlob("/s**m"),RsyncGlob(r"s\**m"))
>>> files=("spam","s/p/a/m","egg/spam","egg/sp/am","s*m","s*am")
>>> def outh():
...     s="%10.10s" %""
...     for g in globs:
...         s=s+"%10.10s"%g.glob
...     return s
>>> def outf(f):
...     s="%10.10s" %f
...     for g in globs:
...         s=s+"%10.10s"%g.match(f)
...     return s
>>> def out():
...     print(outh())
...     for f in files:
...         print(outf(f))
>>> out()
                 s*m      s**m      s\*m      /s*m     /s**m     s\**m
      spam      True      True     False      True      True     False
   s/p/a/m     False      True     False     False      True     False
  egg/spam      True      True     False     False     False     False
 egg/sp/am     False      True     False     False     False     False
       s*m      True      True      True      True      True      True
      s*am      True      True     False      True      True      True
    """

    # This is applied before escaping re metachars
    __bksl_re = re.compile(r'(\\.)')

    # These are applied after escaping re metachars
    __star2_re = re.compile(r'(\\\*\\\*)')  # "**"
    __star_re = re.compile(r'(\\\*)')       # "*"
    __quest_re = re.compile(r'(\\\?)')      # "?"
    __slash_re = re.compile(r'/')           # "/"

    def __handle_stars(self, s):
        """
        Translate the glob wildcards in <s> into regexp syntax.

        <s> must already have been passed through re.escape(), so the
        wildcards appear as "\\*" and "\\?". "**" becomes ".*", "*"
        becomes "[^/]*" and "?" becomes "[^/]".

        Side effect: sets self.path_match if <s> contains "**".
        """
        parts = self.__star2_re.split(s)

        # side effect: patterns containing "**" match complete path
        if len(parts) > 1:
            self.path_match = True

        # The split keeps the "**" separators at the odd positions; only
        # the text between them (even positions) can hold "*" and "?".
        converted = []
        for chunk in parts[::2]:
            chunk = self.__star_re.sub("[^/]*", chunk)
            converted.append(self.__quest_re.sub("[^/]", chunk))
        # Every "**" separator turns into ".*".
        return ".*".join(converted)

    def __init__(self, pat=""):
        """
        Compile the rsync-style glob <pat> into a regular expression.

        See the class docstring for the pattern syntax and for the
        attributes that are set here.
        """
        self.type = None

        # patterns starting with +/-.
        if pat[:2] == "+ ":
            self.type = INCLUDE
            pat = pat[2:]
        elif pat[:2] == "- ":
            self.type = EXCLUDE
            pat = pat[2:]

        # patterns ending with "/" match only directories
        self.dir_match = pat.endswith("/")
        if self.dir_match:
            pat = pat[:-1]

        self.glob = pat

        # patterns containing "/" match entire path,
        self.path_match = "/" in pat

        # patterns starting with "/" match only at root
        top_match = pat[:1] == "/"
        if top_match:
            pat = pat[1:]

        # We transform the glob pattern into a regexp pattern now.
        # First, split off all characters escaped with backslashes:
        # odd elements of parts are the escaped pairs, even elements
        # are plain glob text.
        parts = self.__bksl_re.split(pat)
        regexp = []
        for i in range(0, len(parts), 2):
            if parts[i]:
                # escape any remaining regexp metacharacters like ".",
                # then sort out "**", "*" and "?"
                regexp.append(self.__handle_stars(re.escape(parts[i])))
            if i + 1 < len(parts):
                # Add back the escaped char as a literal. It has to go
                # through re.escape() as well; copying the pair verbatim
                # would turn e.g. "\d" into the regexp digit class.
                regexp.append(re.escape(parts[i + 1][1]))

        self.pat = "".join(regexp) + "$"
        if top_match:
            self.pat = "^" + self.pat
        else:
            # Special case: '**/' matches empty string.
            # Depending on the Python version re.escape() may or may not
            # have put a backslash in front of the "/", so check both.
            for prefix in (".*\\/", ".*/"):
                if self.pat.startswith(prefix):
                    self.pat = "(.*/|)" + self.pat[len(prefix):]
                    break

        self.logger.debug("regexp: %s -> (%s)" % (self, self.pat))
        self.re = re.compile(self.pat)

    def __str__(self):
        """
        Return "(<glob>)[<type><p>]": the glob (with its trailing "/" for
        directory patterns), the rule type or a blank, and "p" if the
        pattern is matched against the whole path.
        """
        shown = self.glob
        if self.dir_match:
            shown = shown + "/"

        if self.path_match:
            flag = "p"
        else:
            flag = " "
        return "(%s)[%s%s]" % (shown, self.type or " ", flag)

    def match(self, filename):
        """
        Return True if <filename> matches this pattern, else False.

        A <filename> ending in "/" denotes a directory; the slash is
        removed before matching. Directory-only patterns never match a
        name without that trailing slash.
        """
        if filename.endswith("/"):
            filename = filename[:-1]
        elif self.dir_match:
            return False

        if self.path_match:
            hit = self.re.search(filename)
        else:
            # Patterns without "/" or "**" only look at the last component.
            hit = self.re.match(os.path.basename(filename))

        matched = hit is not None
        if matched:
            self.logger.debug("%s matches %s" % (filename, self))
        return matched
180
181
class GlobChain(loggingclass.LoggingClass):
    r"""
    A class that represents a chain of RsyncGlob filter rules.
    Filter rules are applied in order. The recurse() function can
    be used to filter directories recursively.

    NOTE(review): self.logger and set_log_level() are presumably
    provided by loggingclass.LoggingClass - confirm there.

    doctest example (the log format leaves out source line numbers, so
    the expected output does not depend on the layout of this file):

>>> loggingclass.init_logging(level=loggingclass.DEBUG,
...     format="%(name)s: %(message)s",
...     stream=sys.stdout)
>>> ch = GlobChain()
>>> ch.set_log_level(loggingclass.DEBUG)
>>> ch.exclude("+ spam/", "- /*/", "+ egg/", "- */", r"+ \*", "- *")
GlobChain: added rule: (spam/)[+ ]
GlobChain: added rule: (/*/)[-p]
GlobChain: added rule: (egg/)[+ ]
GlobChain: added rule: (*/)[- ]
GlobChain: added rule: (\*)[+ ]
GlobChain: added rule: (*)[- ]
>>> for x in ("spam", "spam/", "egg",
...            "egg/", "*", "spam/egg",
...             "spam/egg/", "spam/*/egg", "spam/egg/*"):
...     xx = ch.match(x)
GlobChain: exclude spam
GlobChain: include spam/
GlobChain: exclude egg
GlobChain: exclude egg/
GlobChain: include *
GlobChain: exclude spam/egg
GlobChain: include spam/egg/
GlobChain: exclude spam/*/egg
GlobChain: include spam/egg/*
    """

    # Trailing slashes of the directory passed to recurse()
    __end_re = re.compile(r"/+$")

    def __init__(self):
        """Create an empty rule chain."""
        self._lst = []

    def _in_ex(self, x):
        """
        Return "include" for INCLUDE, "exclude" for EXCLUDE and None
        for anything else. Used for log messages.
        """
        if x == INCLUDE:
            return "include"
        elif x == EXCLUDE:
            return "exclude"
        else:
            return None

    def add(self, type, *args):
        """
        add(type, *args): add (a) new INCLUDE/EXCLUDE pattern rule(s).
        <type> is either INCLUDE or EXCLUDE, or the patterns must
        start with "+" or "-".
        +/- start character has precedence over <type>.

        The special pattern "!" removes all rules added so far.

        Raises ValueError if <type> is None and a pattern carries no
        "+ "/"- " prefix.
        """
        for rule in args:
            if rule == "!":
                # "!" resets the rule chain, as it does in rsync.
                del self._lst[:]
                continue

            glb = RsyncGlob(rule)
            if glb.type is None:
                if type is None:
                    raise ValueError("filter type is undefined for %s" % glb)
                glb.type = type
            self.logger.info("added rule: %s" % glb)
            self._lst.append(glb)

    def exclude(self, *args):
        """
        exclude(*args): add (a) new EXCLUDE pattern rule(s)
        """
        self.add(EXCLUDE, *args)

    def include(self, *args):
        """
        include(*args): add (a) new INCLUDE pattern rule(s)
        """
        self.add(INCLUDE, *args)

    def match(self, path):
        """match(path): returns the result of the current filter chain for path.
        The rule chain is traversed until the first rule matches.
        If path is a directory, it should end in "/".

        The result is the type of the first matching rule, or INCLUDE
        if no rule matches.
        """
        # Default is always INCLUDE
        ret = INCLUDE
        for glb in self._lst:
            if glb.match(path):
                ret = glb.type
                break

        self.logger.debug("%s %s" % (self._in_ex(ret), path))
        return ret

    def _recurse(self, top, dir, collector, *args):
        """
        Walk the directory <dir> (relative to <top>) and report every
        entry to <collector> as collector(verdict, path, *args), where
        <path> is relative to <top> and ends in "/" for directories.
        Directories with an EXCLUDE verdict are not entered.
        """
        for entry in os.listdir(os.path.join(top, dir)):
            rel = os.path.join(dir, entry)
            isdir = os.path.isdir(os.path.join(top, rel))

            # Directories are marked with a trailing "/" for the rules.
            pat = rel
            if isdir:
                pat = pat + "/"

            verdict = self.match(pat)
            collector(verdict, pat, *args)

            if isdir and verdict != EXCLUDE:
                self._recurse(top, rel, collector, *args)

    def collect(self, c, x, *args):
        """
        Default collector function to use with recurse().

        args[0] is the result list: <x> is appended to it when
        c == INCLUDE, and the list is returned when c == DONE.
        In all other cases None is returned.
        """
        found = args[0]
        if c == INCLUDE:
            found.append(x)
        elif c == DONE:
            return found

    def recurse(self, dir, collector = None, *args):
        """
        recurse(self, dir, collector = None, *args)
        recursively descend <dir>. For each file or directory found,
        <collector> is called with arguments (c, x, *args), where
        <c> is the result of the test chain (or DONE when finished),
        <x> is the current path, and <*args> is the rest of the arguments
        of recurse().

        Directories for which the chain evaluates to EXCLUDE are never entered.

        When c == DONE, <collector> should return its final result. This will
        be the return value of recurse().

        If <collector> isn't set, a default collector function is used that
        returns a list of all files below <dir> for which c == INCLUDE.

        Raises IOError if <dir> is not a directory.
        """
        if collector is None:
            collector = self.collect
            args = ([],)

        # Drop trailing slashes. A path made up of slashes only is the
        # root directory and must not be reduced to an empty string.
        stripped = self.__end_re.sub("", dir)
        if dir and not stripped:
            stripped = "/"
        dir = stripped

        if not os.path.isdir(dir):
            raise IOError("%s is not a directory" % dir)

        self._recurse(dir, "", collector, *args)
        return collector(DONE, None, *args)

    def add_file(self, type, name):
        """
        add_file(type, name):
        Add a list of INCLUDE/EXCLUDE filter rules from a file.
        Used to implement --exclude-from, --include-from.

        <name> is a file name, or "-" for standard input. Each line is
        one rule. As in rsync, blank lines and lines starting with ";"
        or "#" are skipped.
        """
        # Open outside the try block: if open() fails there is nothing
        # to close, and the original error must reach the caller.
        if name == "-":
            f = sys.stdin
        else:
            f = open(name, "r")

        try:
            for line in f:
                if line.endswith("\n"):
                    line = line[:-1]
                if not line or line[0] in ";#":
                    continue
                self.add(type, line)
        finally:
            # Never close standard input.
            if f is not sys.stdin:
                f.close()

    _known_options = ("exclude=", "include=",
                      "exclude-from=", "include-from=")

    def options(self):
        """
        Return the long option names (getopt syntax) that getopt()
        understands.
        """
        return self._known_options

    def getopt(self, options):
        """
        getopt(options): parse getopt-style option pairs for filter rules.
        Parses all options "--exclude=", "--include=", "--exclude-from=",
        "--include-from=", leaving other options untouched.
        See rsync(1) for the option semantics.

        <options> is a list of (name, value) pairs. It is modified in
        place: the pairs handled here are removed from it.
        """
        remaining = []
        for opt in options:
            (name, val) = opt
            if name == "--exclude":
                self.exclude(val)
            elif name == "--include":
                self.include(val)
            elif name == "--exclude-from":
                self.add_file(EXCLUDE, val)
            elif name == "--include-from":
                self.add_file(INCLUDE, val)
            else:
                remaining.append(opt)

        # Slice assignment keeps the caller's list object.
        options[:] = remaining
394            
395
def print_matches():
    """
    Usage: rsyncmatch.py [--debug] [filter rules...] directory ...

    Print list of files below <directory> that match the given rules.
    Rules are specified with "--exclude=", "--include=", "--exclude-from=",
    "--include-from=", see rsync(1) for rule semantics.

    The command line is taken from sys.argv.
    """

    import getopt

    loggingclass.init_logging()

    gl = GlobChain()
    (options, args) = getopt.gnu_getopt(sys.argv[1:], "",
                                        gl.options() + ("debug",))
    # The chain consumes its own options and removes them from the list,
    # so only foreign options such as --debug are left afterwards.
    gl.getopt(options)
    for opt in options:
        if opt[0] == "--debug":
            # NOTE(review): set_log_level() is called on throw-away
            # instances, so it presumably changes the level for the
            # whole class - confirm in loggingclass.
            GlobChain().set_log_level(loggingclass.DEBUG)
            RsyncGlob().set_log_level(loggingclass.DEBUG)

    for directory in args:
        included = gl.recurse(directory)

        print("List of include files below %s:" % directory)
        for path in included:
            print("   " + path)
426
427
def _test():
    """Run the doctests contained in the rsyncmatch module."""
    import doctest
    import rsyncmatch

    doctest.testmod(rsyncmatch)
431
if __name__ == "__main__":
    # "--test" as first argument runs the doctests, anything else is
    # handled by print_matches(). The length check keeps a call without
    # any arguments from failing with an IndexError.
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        # Drop the script name so that "--test" becomes argv[0].
        sys.argv = sys.argv[1:]
        _test()
    else:
        print_matches()
Note: See TracBrowser for help on using the browser.