home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_262 / csv.pyo (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2011-09-09  |  10.6 KB  |  390 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyo (Python 2.7)
  3.  
  4. import re
  5. from functools import reduce
  6. from _csv import Error, __version__, writer, reader, register_dialect, unregister_dialect, get_dialect, list_dialects, field_size_limit, QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, __doc__
  7. from _csv import Dialect as _Dialect
  8.  
  9. try:
  10.     from cStringIO import StringIO
  11. except ImportError:
  12.     from StringIO import StringIO
  13.  
  14. __all__ = [
  15.     'QUOTE_MINIMAL',
  16.     'QUOTE_ALL',
  17.     'QUOTE_NONNUMERIC',
  18.     'QUOTE_NONE',
  19.     'Error',
  20.     'Dialect',
  21.     '__doc__',
  22.     'excel',
  23.     'excel_tab',
  24.     'field_size_limit',
  25.     'reader',
  26.     'writer',
  27.     'register_dialect',
  28.     'get_dialect',
  29.     'list_dialects',
  30.     'Sniffer',
  31.     'unregister_dialect',
  32.     '__version__',
  33.     'DictReader',
  34.     'DictWriter']
  35.  
  36. class Dialect:
  37.     _name = ''
  38.     _valid = False
  39.     delimiter = None
  40.     quotechar = None
  41.     escapechar = None
  42.     doublequote = None
  43.     skipinitialspace = None
  44.     lineterminator = None
  45.     quoting = None
  46.     
  47.     def __init__(self):
  48.         if self.__class__ != Dialect:
  49.             self._valid = True
  50.         self._validate()
  51.  
  52.     
  53.     def _validate(self):
  54.         
  55.         try:
  56.             _Dialect(self)
  57.         except TypeError:
  58.             e = None
  59.             raise Error(str(e))
  60.  
  61.  
  62.  
  63.  
  64. class excel(Dialect):
  65.     delimiter = ','
  66.     quotechar = '"'
  67.     doublequote = True
  68.     skipinitialspace = False
  69.     lineterminator = '\r\n'
  70.     quoting = QUOTE_MINIMAL
  71.  
  72. register_dialect('excel', excel)
  73.  
  74. class excel_tab(excel):
  75.     delimiter = '\t'
  76.  
  77. register_dialect('excel-tab', excel_tab)
  78.  
  79. class DictReader:
  80.     
  81.     def __init__(self, f, fieldnames = None, restkey = None, restval = None, dialect = 'excel', *args, **kwds):
  82.         self._fieldnames = fieldnames
  83.         self.restkey = restkey
  84.         self.restval = restval
  85.         self.reader = reader(f, dialect, *args, **kwds)
  86.         self.dialect = dialect
  87.         self.line_num = 0
  88.  
  89.     
  90.     def __iter__(self):
  91.         return self
  92.  
  93.     
  94.     def fieldnames(self):
  95.         if self._fieldnames is None:
  96.             
  97.             try:
  98.                 self._fieldnames = self.reader.next()
  99.             except StopIteration:
  100.                 pass
  101.             
  102.  
  103.         self.line_num = self.reader.line_num
  104.         return self._fieldnames
  105.  
  106.     fieldnames = property(fieldnames)
  107.     
  108.     def fieldnames(self, value):
  109.         self._fieldnames = value
  110.  
  111.     fieldnames = fieldnames.setter(fieldnames)
  112.     
  113.     def next(self):
  114.         if self.line_num == 0:
  115.             self.fieldnames
  116.         row = self.reader.next()
  117.         self.line_num = self.reader.line_num
  118.         while row == []:
  119.             row = self.reader.next()
  120.         d = dict(zip(self.fieldnames, row))
  121.         lf = len(self.fieldnames)
  122.         lr = len(row)
  123.         if lf < lr:
  124.             d[self.restkey] = row[lf:]
  125.         elif lf > lr:
  126.             for key in self.fieldnames[lr:]:
  127.                 d[key] = self.restval
  128.             
  129.         return d
  130.  
  131.  
  132.  
  133. class DictWriter:
  134.     
  135.     def __init__(self, f, fieldnames, restval = '', extrasaction = 'raise', dialect = 'excel', *args, **kwds):
  136.         self.fieldnames = fieldnames
  137.         self.restval = restval
  138.         if extrasaction.lower() not in ('raise', 'ignore'):
  139.             raise ValueError, "extrasaction (%s) must be 'raise' or 'ignore'" % extrasaction
  140.         self.extrasaction = extrasaction
  141.         self.writer = writer(f, dialect, *args, **kwds)
  142.  
  143.     
  144.     def writeheader(self):
  145.         header = dict(zip(self.fieldnames, self.fieldnames))
  146.         self.writerow(header)
  147.  
  148.     
  149.     def _dict_to_list(self, rowdict):
  150.         if self.extrasaction == 'raise':
  151.             wrong_fields = [ k for k in rowdict if k not in self.fieldnames ]
  152.             if wrong_fields:
  153.                 raise ValueError('dict contains fields not in fieldnames: ' + ', '.join(wrong_fields))
  154.         return [ rowdict.get(key, self.restval) for key in self.fieldnames ]
  155.  
  156.     
  157.     def writerow(self, rowdict):
  158.         return self.writer.writerow(self._dict_to_list(rowdict))
  159.  
  160.     
  161.     def writerows(self, rowdicts):
  162.         rows = []
  163.         for rowdict in rowdicts:
  164.             rows.append(self._dict_to_list(rowdict))
  165.         
  166.         return self.writer.writerows(rows)
  167.  
  168.  
  169.  
  170. try:
  171.     complex
  172. except NameError:
  173.     complex = float
  174.  
  175.  
  176. class Sniffer:
  177.     
  178.     def __init__(self):
  179.         self.preferred = [
  180.             ',',
  181.             '\t',
  182.             ';',
  183.             ' ',
  184.             ':']
  185.  
  186.     
  187.     def sniff(self, sample, delimiters = None):
  188.         (quotechar, doublequote, delimiter, skipinitialspace) = self._guess_quote_and_delimiter(sample, delimiters)
  189.         if not delimiter:
  190.             (delimiter, skipinitialspace) = self._guess_delimiter(sample, delimiters)
  191.         if not delimiter:
  192.             raise Error, 'Could not determine delimiter'
  193.         
  194.         class dialect(Dialect):
  195.             _name = 'sniffed'
  196.             lineterminator = '\r\n'
  197.             quoting = QUOTE_MINIMAL
  198.  
  199.         dialect.doublequote = doublequote
  200.         dialect.delimiter = delimiter
  201.         if not quotechar:
  202.             pass
  203.         dialect.quotechar = '"'
  204.         dialect.skipinitialspace = skipinitialspace
  205.         return dialect
  206.  
  207.     
  208.     def _guess_quote_and_delimiter(self, data, delimiters):
  209.         matches = []
  210.         for restr in ('(?P<delim>[^\\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\\w\n"\'])(?P<space> ?)', '(?P<delim>>[^\\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):
  211.             regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
  212.             matches = regexp.findall(data)
  213.             if matches:
  214.                 break
  215.                 continue
  216.         if not matches:
  217.             return ('', False, None, 0)
  218.         quotes = None
  219.         delims = { }
  220.         spaces = 0
  221.         for m in matches:
  222.             n = regexp.groupindex['quote'] - 1
  223.             key = m[n]
  224.             if key:
  225.                 quotes[key] = quotes.get(key, 0) + 1
  226.             
  227.             try:
  228.                 n = regexp.groupindex['delim'] - 1
  229.                 key = m[n]
  230.             except KeyError:
  231.                 continue
  232.  
  233.             if key:
  234.                 if delimiters is None or key in delimiters:
  235.                     delims[key] = delims.get(key, 0) + 1
  236.             
  237.             try:
  238.                 n = regexp.groupindex['space'] - 1
  239.             except KeyError:
  240.                 continue
  241.  
  242.             if m[n]:
  243.                 spaces += 1
  244.                 continue
  245.         quotechar = reduce((lambda a, b, quotes = quotes: if not quotes[a] > quotes[b] or a:
  246. passb), quotes.keys())
  247.         if delims:
  248.             delim = reduce((lambda a, b, delims = delims: if not delims[a] > delims[b] or a:
  249. passb), delims.keys())
  250.             skipinitialspace = delims[delim] == spaces
  251.             if delim == '\n':
  252.                 delim = ''
  253.             
  254.         else:
  255.             delim = ''
  256.             skipinitialspace = 0
  257.         dq_regexp = re.compile('((%(delim)s)|^)\\W*%(quote)s[^%(delim)s\\n]*%(quote)s[^%(delim)s\\n]*%(quote)s\\W*((%(delim)s)|$)' % {
  258.             'delim': delim,
  259.             'quote': quotechar }, re.MULTILINE)
  260.         if dq_regexp.search(data):
  261.             doublequote = True
  262.         else:
  263.             doublequote = False
  264.         return (quotechar, doublequote, delim, skipinitialspace)
  265.  
  266.     
  267.     def _guess_delimiter(self, data, delimiters):
  268.         data = filter(None, data.split('\n'))
  269.         ascii = [ chr(c) for c in range(127) ]
  270.         chunkLength = min(10, len(data))
  271.         iteration = 0
  272.         charFrequency = { }
  273.         modes = { }
  274.         delims = { }
  275.         start = 0
  276.         end = min(chunkLength, len(data))
  277.         while start < len(data):
  278.             iteration += 1
  279.             for line in data[start:end]:
  280.                 for char in ascii:
  281.                     metaFrequency = charFrequency.get(char, { })
  282.                     freq = line.count(char)
  283.                     metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
  284.                     charFrequency[char] = metaFrequency
  285.                 
  286.             
  287.             for char in charFrequency.keys():
  288.                 items = charFrequency[char].items()
  289.                 if len(items) == 1 and items[0][0] == 0:
  290.                     continue
  291.                 if len(items) > 1:
  292.                     modes[char] = reduce((lambda a, b: if not a[1] > b[1] or a:
  293. passb), items)
  294.                     items.remove(modes[char])
  295.                     modes[char] = (modes[char][0], modes[char][1] - reduce((lambda a, b: (0, a[1] + b[1])), items)[1])
  296.                     continue
  297.                 modes[char] = items[0]
  298.             
  299.             modeList = modes.items()
  300.             total = float(chunkLength * iteration)
  301.             consistency = 1
  302.             threshold = 0.9
  303.             while len(delims) == 0 and consistency >= threshold:
  304.                 for k, v in modeList:
  305.                     if v[0] > 0 and v[1] > 0 or v[1] / total >= consistency:
  306.                         if delimiters is None or k in delimiters:
  307.                             delims[k] = v
  308.                         
  309.                 
  310.                 consistency -= 0.01
  311.             if len(delims) == 1:
  312.                 delim = delims.keys()[0]
  313.                 skipinitialspace = data[0].count(delim) == data[0].count('%c ' % delim)
  314.                 return (delim, skipinitialspace)
  315.             start = None
  316.             end += chunkLength
  317.         if not delims:
  318.             return ('', 0)
  319.         if None(delims) > 1:
  320.             for d in self.preferred:
  321.                 if d in delims.keys():
  322.                     skipinitialspace = data[0].count(d) == data[0].count('%c ' % d)
  323.                     return (d, skipinitialspace)
  324.             
  325.         items = [ (v, k) for k, v in delims.items() ]
  326.         items.sort()
  327.         delim = items[-1][1]
  328.         skipinitialspace = data[0].count(delim) == data[0].count('%c ' % delim)
  329.         return (delim, skipinitialspace)
  330.  
  331.     
  332.     def has_header(self, sample):
  333.         rdr = reader(StringIO(sample), self.sniff(sample))
  334.         header = rdr.next()
  335.         columns = len(header)
  336.         columnTypes = { }
  337.         for i in range(columns):
  338.             columnTypes[i] = None
  339.         
  340.         checked = 0
  341.         for row in rdr:
  342.             if checked > 20:
  343.                 break
  344.             checked += 1
  345.             if len(row) != columns:
  346.                 continue
  347.             for col in columnTypes.keys():
  348.                 for thisType in [
  349.                     int,
  350.                     long,
  351.                     float,
  352.                     complex]:
  353.                     
  354.                     try:
  355.                         thisType(row[col])
  356.                     continue
  357.                     except (ValueError, OverflowError):
  358.                         continue
  359.                     
  360.  
  361.                 else:
  362.                     thisType = len(row[col])
  363.                 if thisType == long:
  364.                     thisType = int
  365.                 if thisType != columnTypes[col] or columnTypes[col] is None:
  366.                     columnTypes[col] = thisType
  367.                 else:
  368.                     del columnTypes[col]
  369.             
  370.         
  371.         hasHeader = 0
  372.         for col, colType in columnTypes.items():
  373.             if type(colType) == type(0):
  374.                 if len(header[col]) != colType:
  375.                     hasHeader += 1
  376.                 else:
  377.                     hasHeader -= 1
  378.             
  379.             try:
  380.                 colType(header[col])
  381.             except (ValueError, TypeError):
  382.                 hasHeader += 1
  383.                 continue
  384.  
  385.             hasHeader -= 1
  386.         
  387.         return hasHeader > 0
  388.  
  389.  
  390.