| 1 |
import re |
|---|
| 2 |
import os |
|---|
| 3 |
import sys |
|---|
| 4 |
import loggingclass |
|---|
| 5 |
|
|---|
| 6 |
# Markers shared by RsyncGlob and GlobChain.
# INCLUDE / EXCLUDE are stored as a rule's type and are returned by
# GlobChain.match(); DONE is handed to a collector callback by
# GlobChain.recurse() once the directory walk has finished.
INCLUDE = "+"
EXCLUDE = "-"
DONE = "."
|---|
| 9 |
|
|---|
| 10 |
class RsyncGlob(loggingclass.LoggingClass):
    r"""
    A class that imitates rsync(1)'s way of include/exclude patterns.
    Similar to glob()/fnmatch(), but "*" doesn't match "/" - "**" does.
    See man rsync(1) for the exclude/include logic.
    The GlobChain class creates filter chains like rsync's.

    RsyncGlob(pattern), where <pattern> follows the rules in the rsync(1)
    man page. In particular, "/XYZ" matches only at the "root", and "XYZ/"
    matches only directories.

    A pattern may start with "+ " or "- "; the prefix is removed and
    remembered as the rule type.

    Instance attributes set by the constructor:
        type        INCLUDE, EXCLUDE, or None if the pattern had no prefix
        dir_match   True if the pattern ended in "/" (directories only)
        glob        the pattern without "+ "/"- " prefix and trailing "/"
        path_match  True if the pattern contains "/" or "**"; such patterns
                    are tested against the whole path, others only against
                    the last path component
        pat         source text of the generated regular expression
        re          the compiled regular expression

    NOTE: This class uses Unix file name conventions.
    It will be pretty simple to implement it for DOS/Windows though,
    if someone volunteers.

    The following doctest example shows how the globbing works.
    Note that leading "/" are stripped for files.

    >>> globs=(RsyncGlob("s*m"),RsyncGlob("s**m"),
    ...        RsyncGlob(r"s\*m"),RsyncGlob("/s*m"),
    ...        RsyncGlob("/s**m"),RsyncGlob(r"s\**m"))
    >>> files=("spam","s/p/a/m","egg/spam","egg/sp/am","s*m","s*am")
    >>> def outh():
    ...     s="%10.10s" %""
    ...     for g in globs:
    ...         s=s+"%10.10s"%g.glob
    ...     return s
    >>> def outf(f):
    ...     s="%10.10s" %f
    ...     for g in globs:
    ...         s=s+"%10.10s"%g.match(f)
    ...     return s
    >>> def out():
    ...     print(outh())
    ...     for f in files:
    ...         print(outf(f))
    >>> out()
                     s*m      s**m      s\*m      /s*m     /s**m     s\**m
          spam      True      True     False      True      True     False
       s/p/a/m     False      True     False     False      True     False
      egg/spam      True      True     False     False     False     False
     egg/sp/am     False      True     False     False     False     False
           s*m      True      True      True      True      True      True
          s*am      True      True     False      True      True      True
    """

    # Splits a pattern into plain text and backslash-escaped pairs ("\x").
    # Because of the capturing group, re.split() returns plain text at the
    # even indices and the escaped pairs at the odd indices.
    __bksl_re = re.compile(r'(\\.)')

    # The following three are applied to text that has already been through
    # re.escape(), where the wildcards appear as "\*" and "\?".
    __star2_re = re.compile(r'(\\\*\\\*)')
    __star_re = re.compile(r'(\\\*)')
    __quest_re = re.compile(r'(\\\?)')
    __slash_re = re.compile(r'/')

    def __handle_stars(self, s):
        """
        Translate the wildcards in <s> into regular expression syntax.

        <s> must be the output of re.escape(). "**" becomes ".*", "*"
        becomes "[^/]*" and "?" becomes "[^/]".

        Side effect: self.path_match is switched on if <s> contains "**".

        Returns the translated regular expression fragment.
        """
        chunks = self.__star2_re.split(s)

        # More than one chunk means at least one "**" was found.
        self.path_match = self.path_match or (len(chunks) > 1)

        # Even indices hold the text between the "**" separators; joining
        # them with ".*" puts the separators back in translated form.
        translated = []
        for chunk in chunks[::2]:
            chunk = self.__star_re.sub("[^/]*", chunk)
            translated.append(self.__quest_re.sub("[^/]", chunk))
        return ".*".join(translated)

    def __init__(self, pat=""):
        """
        Compile the rsync-style pattern <pat>.

        See the class docstring for the pattern syntax and for the
        attributes that are set here.
        """
        self.type = None

        # Optional rule type prefix.
        if pat.startswith("+ "):
            self.type = INCLUDE
            pat = pat[2:]
        elif pat.startswith("- "):
            self.type = EXCLUDE
            pat = pat[2:]

        # A trailing "/" restricts the rule to directories.
        self.dir_match = pat.endswith("/")
        if self.dir_match:
            pat = pat[:-1]

        self.glob = pat

        # Evaluated before the leading "/" is removed, so anchored patterns
        # are always matched against the whole path.
        self.path_match = "/" in pat

        # A leading "/" anchors the pattern at the top of the tree.
        top_match = pat.startswith("/")
        if top_match:
            pat = pat[1:]

        pieces = []
        for idx, part in enumerate(self.__bksl_re.split(pat)):
            if idx % 2:
                # Backslash-escaped character: always a literal. Escaping
                # it again (instead of copying "\x" verbatim) keeps pairs
                # like "\d" or "\b" from turning into regex constructs.
                pieces.append(re.escape(part[1]))
            elif part:
                pieces.append(self.__handle_stars(re.escape(part)))
        self.pat = "".join(pieces) + "$"

        if top_match:
            self.pat = "^" + self.pat
        else:
            # A leading "**/" may also match nothing at all, so that
            # "**/x" matches "x" at the top level. re.escape() stopped
            # escaping "/" in Python 3.7, hence both spellings are tested.
            for prefix in (".*\\/", ".*/"):
                if self.pat.startswith(prefix):
                    self.pat = "(.*/|)" + self.pat[len(prefix):]
                    break

        self.logger.debug("regexp: %s -> (%s)" % (self, self.pat))
        self.re = re.compile(self.pat)

    def __str__(self):
        """
        Return "(<pattern>)[<t><p>]", where <t> is the rule type (or a
        blank) and <p> is "p" for path-matching patterns (or a blank).
        """
        s = self.glob
        if self.dir_match:
            s = s + "/"
        t = self.type or " "
        p = "p" if self.path_match else " "
        return "(%s)[%s%s]" % (s, t, p)

    def match(self, filename):
        """
        Return True if <filename> matches this pattern, False otherwise.

        A <filename> ending in "/" is taken to be a directory; the slash
        is removed before matching. Directory-only patterns never match a
        name without the trailing slash.
        """
        if filename.endswith("/"):
            filename = filename[:-1]
        elif self.dir_match:
            return False

        if self.path_match:
            hit = self.re.search(filename)
        else:
            # Patterns without "/" or "**" only look at the last component.
            hit = self.re.match(os.path.basename(filename))
        ret = hit is not None

        if ret:
            self.logger.debug("%s matches %s" % (filename, self))
        return ret
|---|
| 180 |
|
|---|
| 181 |
|
|---|
| 182 |
class GlobChain(loggingclass.LoggingClass):
    r"""
    A class that represents a chain of RsyncGlob filter rules.
    Filter rules are applied in order. The recurse() function can
    be used to filter directories recursively.

    doctest example (the log format deliberately omits the source line
    number, so that the expected output does not depend on the layout of
    this file):

    >>> loggingclass.init_logging(level=loggingclass.DEBUG,
    ...                           format="%(name)s: %(message)s",
    ...                           stream=sys.stdout)
    >>>
    >>> ch = GlobChain()
    >>> ch.set_log_level(loggingclass.DEBUG)
    >>>
    >>> ch.exclude("+ spam/", "- /*/", "+ egg/", "- */", r"+ \*", "- *")
    GlobChain: added rule: (spam/)[+ ]
    GlobChain: added rule: (/*/)[-p]
    GlobChain: added rule: (egg/)[+ ]
    GlobChain: added rule: (*/)[- ]
    GlobChain: added rule: (\*)[+ ]
    GlobChain: added rule: (*)[- ]
    >>> for x in ("spam", "spam/", "egg",
    ...           "egg/", "*", "spam/egg",
    ...           "spam/egg/", "spam/*/egg", "spam/egg/*"):
    ...     xx = ch.match(x)
    GlobChain: exclude spam
    GlobChain: include spam/
    GlobChain: exclude egg
    GlobChain: exclude egg/
    GlobChain: include *
    GlobChain: exclude spam/egg
    GlobChain: include spam/egg/
    GlobChain: exclude spam/*/egg
    GlobChain: include spam/egg/*
    """

    # Matches the run of slashes at the end of a path.
    __end_re = re.compile(r"/+$")

    def __init__(self):
        # Ordered list of RsyncGlob rules; the first matching rule wins.
        self._lst = []

    def _in_ex(self, x):
        """Return "include"/"exclude" for INCLUDE/EXCLUDE, else None."""
        if x == INCLUDE:
            return "include"
        if x == EXCLUDE:
            return "exclude"
        return None

    def add(self, type, *args):
        """
        add(type, *args): add (a) new INCLUDE/EXCLUDE pattern rule(s).
        <type> is either INCLUDE or EXCLUDE, or the patterns must
        start with "+" or "-".
        +/- start character has precedence over <type>.

        The special pattern "!" discards all rules added so far, like
        in rsync(1).

        Raises ValueError if <type> is None and a pattern has no "+ "/"- "
        prefix.
        """
        for x in args:
            if x == "!":
                # Reset the chain. (This used to assign to an unused
                # local variable, which left the rule list untouched.)
                self._lst = []
                continue

            glb = RsyncGlob(x)
            if glb.type is None:
                if type is None:
                    raise ValueError("filter type is undefined for %s" % glb)
                glb.type = type
            self.logger.info("added rule: %s" % glb)
            self._lst.append(glb)

    def exclude(self, *args):
        """
        exclude(*args): add (a) new EXCLUDE pattern rule(s)
        """
        self.add(EXCLUDE, *args)

    def include(self, *args):
        """
        include(*args): add (a) new INCLUDE pattern rule(s)
        """
        self.add(INCLUDE, *args)

    def match(self, path):
        """match(path): returns the result of the current filter chain for path.
        The rule chain is traversed until the first rule matches.
        If path is a directory, it should end in "/".

        Returns the type of the first matching rule, or INCLUDE if no
        rule matches.
        """
        ret = INCLUDE
        for glb in self._lst:
            if glb.match(path):
                ret = glb.type
                break

        self.logger.debug("%s %s" % (self._in_ex(ret), path))
        return ret

    def _recurse(self, top, dir, collector, *args):
        """
        Walk the directory <dir>, given relative to <top>, and call
        collector(c, path, *args) for every entry, where <c> is the
        result of match() and <path> is relative to <top>, with "/"
        appended for directories. Subdirectories are entered unless
        they evaluate to EXCLUDE.
        """
        for f in os.listdir(os.path.join(top, dir)):
            rel = os.path.join(dir, f)
            # NOTE: os.path.isdir() follows symbolic links.
            isdir = os.path.isdir(os.path.join(top, rel))

            pat = rel
            if isdir:
                pat = pat + "/"

            c = self.match(pat)
            collector(c, pat, *args)

            if isdir and c != EXCLUDE:
                self._recurse(top, rel, collector, *args)

    def collect(self, c, x, *args):
        """
        Default collector function to use with recurse().

        args[0] is the result list: <x> is appended to it if c == INCLUDE,
        and the list is returned if c == DONE. In all other cases the
        return value is None.
        """
        l = args[0]
        if c == INCLUDE:
            l.append(x)
        elif c == DONE:
            return l

    def recurse(self, dir, collector = None, *args):
        """
        recurse(self, dir, collector = None, *args)
        recursively descend <dir>. For each file or directory found,
        <collector> is called with arguments (c, x, *args), where
        <c> is the result of the test chain (or DONE when finished),
        <x> is the current path, and <*args> is the rest of the arguments
        of recurse().

        Directories for which the chain evaluates to EXCLUDE are never entered.

        When c == DONE, <collector> should return its final result. This will
        be the return value of recurse().

        If <collector> isn't set, a default collector function is used that
        returns a list of all files below <dir> for which c == INCLUDE.
        In that case any extra <*args> are ignored.

        Raises IOError if <dir> is not a directory.
        """
        if collector is None:
            collector = self.collect
            args = ([],)

        top = self.__end_re.sub("", dir)
        if dir and not top:
            # <dir> consisted of slashes only, i.e. the file system root.
            top = "/"
        if not os.path.isdir(top):
            raise IOError("%s is not a directory" % top)

        self._recurse(top, "", collector, *args)
        return collector(DONE, None, *args)

    def add_file(self, type, name):
        """
        add_file(type, name):
        Add a list of INCLUDE/EXCLUDE filter rules from a file.
        Used to implement --exclude-from, --include-from.

        <name> is the file to read, one rule per line; "-" means
        standard input. <type> is passed on to add() for every line.
        """
        # Opened outside the try block: if open() fails there is nothing
        # to close, and its exception must reach the caller unchanged.
        if name == "-":
            f = sys.stdin
        else:
            f = open(name, "r")

        try:
            for line in f:
                if line.endswith("\n"):
                    line = line[:-1]
                self.add(type, line)
        finally:
            if name != "-":
                f.close()

    # Long option names understood by getopt(), in the format expected
    # by the getopt module.
    _known_options = ("exclude=", "include=",
                      "exclude-from=", "include-from=")

    def options(self):
        """Return the long option names that getopt() understands."""
        return self._known_options

    def getopt(self, options):
        """
        getopt(options): parse getopt-style option pairs for filter rules.
        Parses all options "--exclude=", "--include=", "--exclude-from=",
        "--include-from=", leaving other options untouched.
        See rsync(1) for the option semantics.

        <options> is a list of (name, value) pairs. It is modified in
        place: the options handled here are removed from it.
        """
        remaining = []
        for opt in options:
            (name, val) = opt
            if name == "--exclude":
                self.exclude(val)
            elif name == "--include":
                self.include(val)
            elif name == "--exclude-from":
                self.add_file(EXCLUDE, val)
            elif name == "--include-from":
                self.add_file(INCLUDE, val)
            else:
                remaining.append(opt)

        # Slice assignment, so that the caller's list object is updated.
        options[:] = remaining
|---|
| 394 |
|
|---|
| 395 |
|
|---|
| 396 |
def print_matches():
    """
    Usage: rsyncmatch.py [--debug] [filter rules...] directory ...

    Print list of files below <directory> that match the given rules.
    Rules are specified with "--exclude=", "--include=", "--exclude-from=",
    "--include-from=", see rsync(1) for rule semantics.

    The command line is taken from sys.argv.
    """
    import getopt

    loggingclass.init_logging()

    gl = GlobChain()
    (options, args) = getopt.gnu_getopt(sys.argv[1:], "",
                                        tuple(gl.options()) + ("debug",))

    # Consumes the filter options; whatever is left is looked at below.
    gl.getopt(options)
    for opt in options:
        if opt[0] == "--debug":
            # NOTE(review): the level is set through throw-away instances,
            # so this presumably relies on LoggingClass keeping its logger
            # per class rather than per instance - confirm in loggingclass.
            GlobChain().set_log_level(loggingclass.DEBUG)
            RsyncGlob().set_log_level(loggingclass.DEBUG)

    for top in args:
        ls = gl.recurse(top)

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("List of include files below %s:" % top)
        for x in ls:
            print(" " + x)
|---|
| 426 |
|
|---|
| 427 |
|
|---|
| 428 |
def _test():
    """Run the doctests found in the rsyncmatch module."""
    import doctest
    import rsyncmatch

    doctest.testmod(rsyncmatch)
|---|
| 431 |
|
|---|
| 432 |
if __name__ == "__main__":
    # "--test" as the first argument runs the doctests, anything else is
    # handled by print_matches(). The length check keeps an invocation
    # without arguments from failing with an IndexError.
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        sys.argv = sys.argv[1:]
        _test()
    else:
        print_matches()
|---|