Package translate :: Package storage :: Module statsdb
[hide private]
[frames | no frames]

Source Code for Module translate.storage.statsdb

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2007 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21  from UserDict import UserDict 
 22   
 23  """Module to provide a cache of statistics in a database. 
 24   
 25  @organization: Zuza Software Foundation 
 26  @copyright: 2007 Zuza Software Foundation 
 27  @license: U{GPL <http://www.fsf.org/licensing/licenses/gpl.html>} 
 28  """ 
 29   
 30  from translate import __version__ as toolkitversion 
 31  from translate.storage import factory, base 
 32  from translate.misc.multistring import multistring 
 33  from translate.lang.common import Common 
 34   
 35  try: 
 36      from sqlite3 import dbapi2 
 37  except ImportError: 
 38      from pysqlite2 import dbapi2 
 39  import os.path 
 40  import re 
 41  import sys 
 42  import stat 
 43   
 44  kdepluralre = re.compile("^_n: ") 
 45  brtagre = re.compile("<br\s*?/?>") 
 46  xmltagre = re.compile("<[^>]+>") 
 47  numberre = re.compile("\\D\\.\\D") 
 48   
 49  state_strings = {0: "untranslated", 1: "translated", 2: "fuzzy"} 
 50   
def wordcount(string):
    """Counts the words in the given string.

    KDE plural markers and XML tags are removed, <br> tags become newlines
    and a dot between two non-digit characters becomes a space before the
    words are counted with C{Common.words}."""
    # TODO: po class should understand KDE style plurals
    substitutions = (
        (kdepluralre, ""),
        (brtagre, "\n"),
        (xmltagre, ""),
        (numberre, " "),
    )
    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    #TODO: This should still use the correct language to count in the target
    #language
    words = Common.words(cleaned)
    return len(words)
60
def wordsinunit(unit):
    """Counts the words in the unit's source and target, taking plurals into
    account. The target words are only counted if the unit is translated.

    @return: a C{(sourcewords, targetwords)} tuple
    """
    def plural_forms(text):
        # A multistring carries one string per plural form; anything else is
        # treated as a single (possibly empty) string.
        if isinstance(text, multistring):
            return text.strings
        return [text or ""]

    sourcewords = sum([wordcount(form) for form in plural_forms(unit.source)])
    if not unit.istranslated():
        return sourcewords, 0
    targetwords = sum([wordcount(form) for form in plural_forms(unit.target)])
    return sourcewords, targetwords
80
class Record(UserDict):
    """A dictionary of named values that can be added to and subtracted from
    other records key by key.

    Besides the values named in C{record_keys}, a record may hold derived
    values which are filled in by the C{compute_derived_values} callable.
    Only the values named in C{record_keys} are part of L{to_tuple} and
    L{as_string_for_db}.
    """

    def __init__(self, record_keys, record_values=None, compute_derived_values = lambda x: x):
        """
        @param record_keys: the names of the stored values, in order
        @param record_values: the values matching C{record_keys}; when None,
        every value starts at 0
        @param compute_derived_values: a callable that receives the record
        and may set additional (derived) keys on it
        """
        if record_values is None:
            record_values = [0 for _key in record_keys]
        self.record_keys = record_keys
        self.data = dict(zip(record_keys, record_values))
        self._compute_derived_values = compute_derived_values
        self._compute_derived_values(self)

    def to_tuple(self):
        """Returns the stored (non-derived) values in C{record_keys} order."""
        return tuple(self[key] for key in self.record_keys)

    def __add__(self, other):
        """Returns a new record holding the key-wise sum of both records."""
        # The result keeps this record's derive function, so that records
        # produced by arithmetic behave like the records they came from.
        result = Record(self.record_keys,
                        compute_derived_values=self._compute_derived_values)
        for key in self.keys():
            result[key] = self[key] + other[key]
        # The derived values have to be recomputed on the result (not on
        # self), since they need not be additive.
        result._compute_derived_values(result)
        return result

    def __sub__(self, other):
        """Returns a new record holding the key-wise difference of both
        records."""
        result = Record(self.record_keys,
                        compute_derived_values=self._compute_derived_values)
        for key in self.keys():
            result[key] = self[key] - other[key]
        result._compute_derived_values(result)
        return result

    def as_string_for_db(self):
        """Returns the stored values as a comma separated string of their
        C{repr()}s."""
        return ",".join([repr(x) for x in self.to_tuple()])
# Numeric states as stored in the state column of the units table.
UNTRANSLATED = 0
TRANSLATED = 1
FUZZY = 2


def statefordb(unit):
    """Returns the numeric database state for the unit.

    A translated unit is TRANSLATED; otherwise a fuzzy unit with a non-empty
    target is FUZZY; everything else is UNTRANSLATED."""
    state = UNTRANSLATED
    if unit.istranslated():
        state = TRANSLATED
    elif unit.isfuzzy() and unit.target:
        state = FUZZY
    return state
118
class FileTotals(object):
    """Dictionary-like access to the C{filetotals} table, which holds one row
    of unit and word counts per file id.

    Reading an item returns a L{Record}; assigning a L{Record} to a file id
    inserts or replaces the row of that file; deleting an item removes it.
    """

    # The columns of the filetotals table (without fileid), in table order.
    keys = ['translatedsourcewords',
            'fuzzysourcewords',
            'untranslatedsourcewords',
            'translated',
            'fuzzy',
            'untranslated',
            'translatedtargetwords']

    def db_keys(self):
        """Returns the column names as a comma separated string."""
        return ",".join(self.keys)

    def __init__(self, cur):
        """Creates the filetotals table if it does not exist yet.

        @param cur: the database cursor used for all queries of this object
        """
        self.cur = cur
        self.cur.execute("""
            CREATE TABLE IF NOT EXISTS filetotals(
                fileid                  INTEGER PRIMARY KEY AUTOINCREMENT,
                translatedsourcewords   INTEGER NOT NULL,
                fuzzysourcewords        INTEGER NOT NULL,
                untranslatedsourcewords INTEGER NOT NULL,
                translated              INTEGER NOT NULL,
                fuzzy                   INTEGER NOT NULL,
                untranslated            INTEGER NOT NULL,
                translatedtargetwords   INTEGER NOT NULL);""")

    @classmethod
    def new_record(cls, state_for_db=None, sourcewords=None, targetwords=None):
        """Returns a new record. Without arguments all values are 0;
        otherwise the record describes a single unit.

        @param state_for_db: the numeric state of the unit (UNTRANSLATED,
        TRANSLATED or FUZZY), or None for an all-zero record
        @param sourcewords: the number of source words of the unit
        @param targetwords: the number of target words of the unit (only
        recorded for translated units)
        """
        record = Record(cls.keys, compute_derived_values = cls._compute_derived_values)
        if state_for_db is not None:
            # The states are plain integers, so they are compared by value.
            if state_for_db == UNTRANSLATED:
                record['untranslated'] = 1
                record['untranslatedsourcewords'] = sourcewords
            elif state_for_db == TRANSLATED:
                record['translated'] = 1
                record['translatedsourcewords'] = sourcewords
                record['translatedtargetwords'] = targetwords
            elif state_for_db == FUZZY:
                record['fuzzy'] = 1
                record['fuzzysourcewords'] = sourcewords
            # The derived values were computed while everything was still 0,
            # so they have to be refreshed after filling in the counts.
            cls._compute_derived_values(record)
        return record

    @classmethod
    def _compute_derived_values(cls, record):
        """Sets the values of the record which are not stored in the
        database: total, totalsourcewords and review."""
        record["total"] = record["untranslated"] + \
                          record["translated"] + \
                          record["fuzzy"]
        record["totalsourcewords"] = record["untranslatedsourcewords"] + \
                                     record["translatedsourcewords"] + \
                                     record["fuzzysourcewords"]
        record["review"] = 0

    def __getitem__(self, fileid):
        """Returns the totals of the given file as a L{Record}. If no row
        exists for the file id, all stored values of the record are 0."""
        result = self.cur.execute("""
            SELECT %(keys)s
            FROM   filetotals
            WHERE  fileid=?;""" % {'keys': self.db_keys()}, (fileid,))
        return Record(FileTotals.keys, result.fetchone(), self._compute_derived_values)

    def __setitem__(self, fileid, record):
        """Inserts or replaces the row of the given file with the stored
        values of the record."""
        values = (fileid,) + record.to_tuple()
        # The values are passed as query parameters instead of being
        # formatted into the statement via repr().
        placeholders = ",".join(["?"] * len(values))
        self.cur.execute("""
            INSERT OR REPLACE into filetotals
            VALUES (%s);
        """ % placeholders, values)

    def __delitem__(self, fileid):
        """Removes the row of the given file, if there is one."""
        self.cur.execute("""
            DELETE FROM filetotals
            WHERE fileid=?;
        """, (fileid,))
189
def emptyfiletotals():
    """Returns a new file totals record (as created by
    C{FileTotals.new_record}) with all statistics initialised to 0."""
    totals = FileTotals.new_record()
    return totals
193
def emptyfilechecks():
    """Returns a new, empty dictionary for the check results of a file."""
    return dict()
196
def emptyfilestats():
    """Returns a new dictionary that maps every file statistic name to its
    own empty list."""
    stats = {}
    for name in ("total", "translated", "fuzzy", "untranslated"):
        stats[name] = []
    return stats
199
def emptyunitstats():
    """Returns a new dictionary that maps every unit statistic name to its
    own empty list."""
    stats = {}
    for name in ("sourcewordcount", "targetwordcount"):
        stats[name] = []
    return stats
# We allow the caller to specify which value to return when errors_return_empty
# is True. We do this, since Pootle wants None to be returned when it calls
# get_mod_info directly, whereas we want an integer to be returned for
# uses of get_mod_info within this module.
# TODO: Get rid of empty_return when Pootle code is improved to not require
# this.
def get_mod_info(file_path):
    """Returns the modification information of the given file.

    @param file_path: the path of the file to inspect
    @return: a C{(st_mtime, st_size)} tuple as reported by C{os.stat}
    @raise OSError: if the path cannot be stat'ed
    @raise AssertionError: if the path refers to a directory
    """
    file_stat = os.stat(file_path)
    # Raised explicitly instead of through an assert statement, so that the
    # check is still performed when Python runs with -O. The exception type
    # is kept, since callers catch AssertionError.
    if stat.S_ISDIR(file_stat.st_mode):
        raise AssertionError("%s is a directory" % (file_path,))
    return file_stat.st_mtime, file_stat.st_size
213
def suggestion_extension():
    """Returns the extension (including the extension separator) that is
    appended to a filename to name its suggestion file."""
    return "".join((os.path.extsep, 'pending'))
216
def suggestion_filename(filename):
    """Returns the name of the suggestion file that belongs to the given
    filename, i.e. the filename followed by the suggestion extension."""
    extension = suggestion_extension()
    return filename + extension
219
class StatsCache(object):
    """An object instantiated as a singleton for each statsfile that provides
    access to the database cache from a pool of StatsCache objects."""
    _caches = {}
    defaultfile = None
    con = None
    """This cache's connection"""
    cur = None
    """The current cursor"""

    def __new__(cls, statsfile=None):
        """Returns the cache object for the given statsfile. The object is
        created and added to the pool on first use; later requests for the
        same (real) path return the pooled object.

        @param statsfile: the path of the database file. If it is empty,
        C{stats.db} in a per-user directory is used; that directory is
        created if it does not exist yet.
        """
        def make_database(statsfile):
            def connect(cache):
                cache.con = dbapi2.connect(statsfile)
                cache.cur = cache.con.cursor()

            def clear_old_data(cache):
                """Removes the database file if its first file row was
                written by an older toolkit build. Returns True if the file
                was removed."""
                try:
                    cache.cur.execute("""SELECT toolkitbuild FROM files""")
                    val = cache.cur.fetchone()
                    if val is not None and val[0] < toolkitversion.build:
                        # Release the database handles before removing the
                        # file, so that no connection to the stale file
                        # stays open.
                        cache.cur.close()
                        cache.con.close()
                        os.unlink(statsfile)
                        return True
                    return False
                except dbapi2.OperationalError:
                    # Presumably the files table does not exist yet.
                    return False

            cache = object.__new__(cls)
            connect(cache)
            if clear_old_data(cache):
                connect(cache)
            cache.create()
            # Only add the cache to the pool once it is completely set up,
            # so that a failure above does not leave a broken object behind.
            cls._caches[statsfile] = cache
            return cache

        if not statsfile:
            if not cls.defaultfile:
                userdir = os.path.expanduser("~")
                cachedir = None
                if os.name == "nt":
                    cachedir = os.path.join(userdir, "Translate Toolkit")
                else:
                    cachedir = os.path.join(userdir, ".translate_toolkit")
                if not os.path.exists(cachedir):
                    os.mkdir(cachedir)
                cls.defaultfile = os.path.realpath(os.path.join(cachedir, "stats.db"))
            statsfile = cls.defaultfile
        else:
            statsfile = os.path.realpath(statsfile)
        # First see if a cache for this file already exists:
        if statsfile in cls._caches:
            return cls._caches[statsfile]
        # No existing cache. Let's build a new one and keep a copy
        return make_database(statsfile)

    def create(self):
        """Create all tables and indexes."""
        self.file_totals = FileTotals(self.cur)

        self.cur.execute("""CREATE TABLE IF NOT EXISTS files(
            fileid INTEGER PRIMARY KEY AUTOINCREMENT,
            path VARCHAR NOT NULL UNIQUE,
            st_mtime INTEGER NOT NULL,
            st_size INTEGER NOT NULL,
            toolkitbuild INTEGER NOT NULL);""")

        self.cur.execute("""CREATE UNIQUE INDEX IF NOT EXISTS filepathindex
            ON files (path);""")

        self.cur.execute("""CREATE TABLE IF NOT EXISTS units(
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            unitid VARCHAR NOT NULL,
            fileid INTEGER NOT NULL,
            unitindex INTEGER NOT NULL,
            source VARCHAR NOT NULL,
            target VARCHAR,
            state INTEGER,
            sourcewords INTEGER,
            targetwords INTEGER);""")

        self.cur.execute("""CREATE INDEX IF NOT EXISTS fileidindex
            ON units(fileid);""")

        self.cur.execute("""CREATE TABLE IF NOT EXISTS checkerconfigs(
            configid INTEGER PRIMARY KEY AUTOINCREMENT,
            config VARCHAR);""")

        self.cur.execute("""CREATE INDEX IF NOT EXISTS configindex
            ON checkerconfigs(config);""")

        self.cur.execute("""CREATE TABLE IF NOT EXISTS uniterrors(
            errorid INTEGER PRIMARY KEY AUTOINCREMENT,
            unitindex INTEGER NOT NULL,
            fileid INTEGER NOT NULL,
            configid INTEGER NOT NULL,
            name VARCHAR NOT NULL,
            message VARCHAR);""")

        self.cur.execute("""CREATE INDEX IF NOT EXISTS uniterrorindex
            ON uniterrors(fileid, configid);""")

        self.con.commit()

    def _getfileid(self, filename, check_mod_info=True, store=None, errors_return_empty=False):
        """Returns the fileid of the given file. If the file has no record
        yet, or if its record is out of date, the file is (re)cached first
        and the id of the new record is returned.

        @param filename: the filename to retrieve the id for
        @param check_mod_info: if False, the modification info stored for an
        existing record is overwritten with the current one of the file and
        the record is used as it is. This is only possible if the file
        already has a record.
        @param store: an optional store to cache instead of parsing the file
        @param errors_return_empty: if True, -1 is returned instead of
        raising when the file cannot be stat'ed, parsed or cached
        @rtype: int
        """
        realpath = os.path.realpath(filename)
        self.cur.execute("""SELECT fileid, st_mtime, st_size FROM files
                WHERE path=?;""", (realpath,))
        filerow = self.cur.fetchone()
        try:
            mod_info = get_mod_info(realpath)
            if filerow:
                fileid = filerow[0]
                if not check_mod_info:
                    # Update the mod_info of the file
                    self.cur.execute("""UPDATE files
                            SET st_mtime=?, st_size=?
                            WHERE fileid=?;""", (mod_info[0], mod_info[1], fileid))
                    return fileid
                if (filerow[1], filerow[2]) == mod_info:
                    return fileid
            # We can only ignore the mod_info if the row already exists.
            # This is raised explicitly (not through an assert statement) so
            # that it also works when Python runs with -O.
            if not check_mod_info:
                raise AssertionError("%s has no record to update" % (realpath,))
            store = store or factory.getobject(realpath)
            return self._cachestore(store, realpath, mod_info)
        except (base.ParseError, IOError, OSError, AssertionError):
            if errors_return_empty:
                return -1
            else:
                raise

    def _getstoredcheckerconfig(self, checker):
        """See if this checker configuration has been used before.

        @return: the configid of the stored configuration, or None if the
        configuration has not been stored yet
        """
        config = str(checker.config.__dict__)
        self.cur.execute("""SELECT configid, config FROM checkerconfigs WHERE
            config=?;""", (config,))
        configrow = self.cur.fetchone()
        if not configrow or configrow[1] != config:
            return None
        else:
            return configrow[0]

    def _cacheunitstats(self, units, fileid, unitindex=None, file_totals_record=None):
        """Cache the statistics for the supplied unit(s).

        @param units: the units to cache
        @param fileid: the id of the file the units belong to
        @param unitindex: if given, the index to store for the unit; used
        when a single unit is recached
        @param file_totals_record: the totals to which the statistics of the
        units are added before they are stored as the totals of the file;
        an all-zero record is used by default
        @return: the name of the state of the first unit if unitindex was
        given, otherwise an empty string
        """
        if file_totals_record is None:
            file_totals_record = FileTotals.new_record()
        # Index 0 is a valid unit index, so test against None explicitly.
        single_unit = unitindex is not None
        unitvalues = []
        for index, unit in enumerate(units):
            if unit.istranslatable():
                sourcewords, targetwords = wordsinunit(unit)
                if single_unit:
                    index = unitindex
                # what about plurals in .source and .target?
                unitvalues.append((unit.getid(), fileid, index, \
                                unit.source, unit.target, \
                                sourcewords, targetwords, \
                                statefordb(unit)))
                file_totals_record = file_totals_record + FileTotals.new_record(statefordb(unit), sourcewords, targetwords)
        # XXX: executemany is non-standard
        self.cur.executemany("""INSERT INTO units
            (unitid, fileid, unitindex, source, target, sourcewords, targetwords, state)
            values (?, ?, ?, ?, ?, ?, ?, ?);""",
            unitvalues)
        self.file_totals[fileid] = file_totals_record
        self.con.commit()
        if single_unit:
            return state_strings[statefordb(units[0])]
        return ""

    def _cachestore(self, store, realpath, mod_info):
        """Calculates and caches the statistics of the given store
        unconditionally.

        @return: the fileid of the new record of the file
        """
        # The file gets a new fileid below, so the rows that belong to its
        # previous record have to be removed here; otherwise they would stay
        # in the database without ever being used again.
        self.cur.execute("""SELECT fileid FROM files WHERE
            path=?;""", (realpath,))
        oldrow = self.cur.fetchone()
        if oldrow is not None:
            oldfileid = oldrow[0]
            self.cur.execute("""DELETE FROM units WHERE
                fileid=?;""", (oldfileid,))
            self.cur.execute("""DELETE FROM uniterrors WHERE
                fileid=?;""", (oldfileid,))
            del self.file_totals[oldfileid]
        self.cur.execute("""DELETE FROM files WHERE
            path=?;""", (realpath,))
        self.cur.execute("""INSERT INTO files
            (fileid, path, st_mtime, st_size, toolkitbuild) values (NULL, ?, ?, ?, ?);""",
            (realpath, mod_info[0], mod_info[1], toolkitversion.build))
        fileid = self.cur.lastrowid
        self.cur.execute("""DELETE FROM units WHERE
            fileid=?""", (fileid,))
        self._cacheunitstats(store.units, fileid)
        return fileid

    def filetotals(self, filename):
        """Retrieves the statistics for the given file if possible, otherwise
        delegates to cachestore().

        @return: the totals record of the file, or an empty dictionary if a
        ValueError occurred while looking up the file
        """
        try:
            fileid = self._getfileid(filename)
        except ValueError:
            # sys.exc_info() is used to get at the exception, since this
            # works with all versions of Python.
            sys.stderr.write(str(sys.exc_info()[1]) + "\n")
            return {}
        return self.file_totals[fileid]

    def _cacheunitschecks(self, units, fileid, configid, checker, unitindex=None):
        """Helper method for cachestorechecks() and recacheunit()

        @return: the names of the failed checks, each prefixed with
        "check-"; if unitindex was given, "total" is appended as well
        """
        # We always want to store one dummy error to know that we have actually
        # run the checks on this file with the current checker configuration
        dummy = (-1, fileid, configid, "noerror", "")
        unitvalues = [dummy]
        # if we are doing a single unit, we want to return the checknames
        errornames = []
        # Index 0 is a valid unit index, so test against None explicitly.
        single_unit = unitindex is not None
        for index, unit in enumerate(units):
            if unit.istranslatable():
                # Correctly assign the unitindex
                if single_unit:
                    index = unitindex
                failures = checker.run_filters(unit)
                for checkname, checkmessage in failures.items():
                    unitvalues.append((index, fileid, configid, checkname, checkmessage))
                    errornames.append("check-" + checkname)
        checker.setsuggestionstore(None)

        if single_unit:
            # We are only updating a single unit, so we don't want to add an
            # extra noerror-entry
            unitvalues.remove(dummy)
            errornames.append("total")

        # XXX: executemany is non-standard
        self.cur.executemany("""INSERT INTO uniterrors
            (unitindex, fileid, configid, name, message)
            values (?, ?, ?, ?, ?);""",
            unitvalues)
        self.con.commit()
        return errornames

    def cachestorechecks(self, fileid, store, checker, configid):
        """Calculates and caches the error statistics of the given store
        unconditionally.

        @return: the fileid that was passed in
        """
        # Let's purge all previous failures because they will probably just
        # fill up the database without much use.
        self.cur.execute("""DELETE FROM uniterrors WHERE
            fileid=?;""", (fileid,))
        self._cacheunitschecks(store.units, fileid, configid, checker)
        return fileid

    def get_unit_stats(self, fileid, unitid):
        """Returns the stored C{(state, sourcewords, targetwords)} row of the
        given unit, or None if the unit is not stored."""
        values = self.cur.execute("""
            SELECT   state, sourcewords, targetwords
            FROM     units
            WHERE    fileid=? AND unitid=?
        """, (fileid, unitid))
        return values.fetchone()

    def recacheunit(self, filename, checker, unit):
        """Recalculate all information for a specific unit. This is necessary
        for updating all statistics when a translation of a unit took place,
        for example.

        This method assumes that everything was up to date before (file totals,
        checks, checker config, etc.)

        @return: a list with the name of the state of the unit, followed by
        the names returned by the check helper for the unit
        """
        fileid = self._getfileid(filename, check_mod_info=False)
        configid = self._getstoredcheckerconfig(checker)
        unitid = unit.getid()
        # The old statistics of the unit are taken out of the file totals
        # before the unit is cached again.
        totals_without_unit = self.file_totals[fileid] - \
                                   FileTotals.new_record(*self.get_unit_stats(fileid, unitid))
        # get the unit index
        self.cur.execute("""SELECT unitindex FROM units WHERE
            fileid=? AND unitid=?;""", (fileid, unitid))
        unitindex = self.cur.fetchone()[0]
        self.cur.execute("""DELETE FROM units WHERE
            fileid=? AND unitid=?;""", (fileid, unitid))
        state = [self._cacheunitstats([unit], fileid, unitindex, totals_without_unit)]
        # remove the current errors
        self.cur.execute("""DELETE FROM uniterrors WHERE
            fileid=? AND unitindex=?;""", (fileid, unitindex))
        if os.path.exists(suggestion_filename(filename)):
            checker.setsuggestionstore(factory.getobject(suggestion_filename(filename), ignore=suggestion_extension()))
        state.extend(self._cacheunitschecks([unit], fileid, configid, checker, unitindex))
        return state

    def _checkerrors(self, filename, fileid, configid, checker, store):
        """Makes sure that the checks of the given file were run with the
        given configuration and selects the stored errors.

        @return: a tuple of the first error row and the cursor from which
        the remaining rows can be fetched
        """
        def geterrors():
            self.cur.execute("""SELECT
                name,
                unitindex
                FROM uniterrors WHERE fileid=? and configid=?
                ORDER BY unitindex;""", (fileid, configid))
            return self.cur.fetchone(), self.cur

        first, cur = geterrors()
        if first is not None:
            return first, cur

        # This could happen if we haven't done the checks before, or the
        # file changed, or we are using a different configuration
        store = store or factory.getobject(filename)
        if os.path.exists(suggestion_filename(filename)):
            checker.setsuggestionstore(factory.getobject(suggestion_filename(filename), ignore=suggestion_extension()))
        self.cachestorechecks(fileid, store, checker, configid)
        return geterrors()

    def _geterrors(self, filename, fileid, configid, checker, store):
        """Returns all stored C{(name, unitindex)} error rows of the given
        file and configuration as a list, running the checks first if that
        is necessary."""
        result = []
        first, cur = self._checkerrors(filename, fileid, configid, checker, store)
        result.append(first)
        result.extend(cur.fetchall())
        return result

    def _get_config_id(self, fileid, checker):
        """Returns the configid of the configuration of the checker, storing
        the configuration first if it has not been used before."""
        configid = self._getstoredcheckerconfig(checker)
        if configid is not None:
            return configid
        self.cur.execute("""INSERT INTO checkerconfigs
            (configid, config) values (NULL, ?);""",
            (str(checker.config.__dict__),))
        return self.cur.lastrowid

    def filechecks(self, filename, checker, store=None):
        """Retrieves the error statistics for the given file if possible,
        otherwise delegates to cachestorechecks().

        @return: a dictionary that maps "check-" + the name of each failed
        check to the list of the indices of the units that failed it; it is
        empty if a ValueError occurred while looking up the file
        """
        fileid = None
        configid = None
        try:
            fileid = self._getfileid(filename, store=store)
            configid = self._get_config_id(fileid, checker)
        except ValueError:
            # sys.exc_info() is used to get at the exception, since this
            # works with all versions of Python.
            sys.stderr.write(str(sys.exc_info()[1]) + "\n")
            return emptyfilechecks()

        values = self._geterrors(filename, fileid, configid, checker, store)

        errors = emptyfilechecks()
        for value in values:
            # Skip the dummy entry which only records that the checks ran.
            if value[1] == -1:
                continue
            checkkey = 'check-' + value[0]      #value[0] is the error name
            if checkkey not in errors:
                errors[checkkey] = []
            errors[checkkey].append(value[1])   #value[1] is the unitindex

        return errors

    def file_fails_test(self, filename, checker, name):
        """Returns True if at least one unit of the given file fails the
        check with the given name."""
        fileid = self._getfileid(filename)
        configid = self._get_config_id(fileid, checker)
        self._checkerrors(filename, fileid, configid, checker, None)
        self.cur.execute("""SELECT
            name,
            unitindex
            FROM uniterrors
            WHERE fileid=? and configid=? and name=?;""", (fileid, configid, name))
        return self.cur.fetchone() is not None

    def filestats(self, filename, checker, store=None):
        """Return a dictionary of property names mapping sets of unit
        indices with those properties."""
        stats = emptyfilestats()

        stats.update(self.filechecks(filename, checker, store))
        fileid = self._getfileid(filename, store=store)

        self.cur.execute("""SELECT
            state,
            unitindex
            FROM units WHERE fileid=?
            ORDER BY unitindex;""", (fileid,))

        values = self.cur.fetchall()
        for value in values:
            stats[state_strings[value[0]]].append(value[1])
            stats["total"].append(value[1])

        return stats

    def unitstats(self, filename, _lang=None, store=None):
        # For now, _lang is unused. It will allow the user to base stats
        # information on the given language. See the commented line
        # containing stats.update below.
        """Return a dictionary of property names mapping to arrays which
        map unit indices to property values.

        Please note that this is different from filestats, since filestats
        supplies sets of unit indices with a given property, whereas this
        method supplies arrays which map unit indices to given values."""
        stats = emptyunitstats()

        #stats.update(self.unitchecks(filename, lang, store))
        fileid = self._getfileid(filename, store=store)

        self.cur.execute("""SELECT
            sourcewords, targetwords
            FROM units WHERE fileid=?
            ORDER BY unitindex;""", (fileid,))

        for sourcecount, targetcount in self.cur.fetchall():
            stats["sourcewordcount"].append(sourcecount)
            stats["targetwordcount"].append(targetcount)

        return stats
621