#!/usr/bin/python

# $Id: check_license.py,v 1.68 2010-02-09 11:53:08 potyra Exp $
#
# Copyright (C) 2009-2010 FAUmachine Team <info@faumachine.org>.
# This program is free software. You can redistribute it and/or modify it
# under the terms of the GNU General Public License, either version 2 of
# the License, or (at your option) any later version. See COPYING.

import sys
import string
import re
import os
import struct
import fnmatch
import difflib

# Upper bound for the number of leading lines that are read from a text
# file when looking for copyright and license statements.
SOURCE_LINES = 50

# Directory names that are never descended into, wherever they occur.
BLACKLISTED_DIRS = (
	# cvs specific
	"CVS",
	# created by automake
	".deps",
)

# Exact directory paths (relative to the top directory) that are skipped.
BLACKLISTED_PATHDIRS = (
	# FIXME: for this to work really well, experiments would ideally
	# be conducted in one single directory, so that files created
	# there won't disturb the heuristics.
	"./experiments",
	# FIXME
	"./lib/keymaps",
)

# File names that are skipped in whichever directory they appear.
BLACKLISTED_FILES = (
	# cvs specific
	".cvsignore",
	# generic text only files
	"README",
	"TODO",
)

# Exact file paths (relative to the top directory) that are not checked.
BLACKLISTED_PATHFILES = (
	# that's the destination file
	"./copyright",
	# only an informational text file
	"./AUTHORS",
	# our changelog file
	"./NEWS",
	# informational text files
	"./README.bsd",
	"./README.macosx",
	"./STATE",
	# GPL itself
	"./COPYING",
	# LGPL itself
	"./COPYING.LIB",
	# informational text files
	"./INSTALL",
	"./doc/IO_PORTS.txt",
	"./doc/CODINGSTYLE",
	# FIXME these should go away soon.
	"./doc/FAUmachineLogo/FAUmachineLogo.pdf",
	"./doc/FAUmachineLogo/FAUmachineLogo.cdr",
	# FIXME can comments be added there?
	"./node-pc/simulator/vgabios/vgabios.lds",
	"./node-pc/simulator/vgabios/head.S",
	"./node-pc/simulator/vgabios/entry.S",
	# special graphic formats...
	"./lib/logo2.ppm",
	"./lib/logo.ppm",
	"./lib/logo.epa",
	# can't easily add comments to these.
	"./lib/pattern-matcher/ttf_input/Arial_11pt.txt",
	"./lib/pattern-matcher/ttf_input/Arial_11pt.ppm",
	# commenting these is extremely painful
	"./doc/DesignNotes/architecture.dia",
	"./doc/DesignNotes/shadow.dia",
	"./doc/DesignNotes/simple.dia",
	"./doc/DesignNotes/socket.dia",
)

class CopyrightHolder:
	""" one copyright holder, as named in a single copyright line """

	def __init__(self, firstname, lastname, email, year1, year2):
		""" c'tor.

		    firstname, lastname: name of the holder; one trailing dot
		                         is cut off the last name
		    email: mail address text or None
		    year1, year2: first/last copyright year (anything int()
		                  accepts) or None, if not given
		"""
		if lastname.endswith("."):
			lastname = lastname[:-1]

		self._firstname = firstname
		self._lastname = lastname
		self._email = email

		self._year1 = None
		if year1 is not None:
			self._year1 = int(year1)

		self._year2 = None
		if year2 is not None:
			self._year2 = int(year2)

	def isFAUmachineTeam(self):
		""" is this holder the "FAUmachine Team"? """
		name = (self._firstname, self._lastname)
		return name == ("FAUmachine", "Team")

	def __str__(self):
		""" copyright line, leaving out all members that are None """
		parts = ["Copyright (c)"]
		if self._year1 is not None:
			parts.append(" %d" % self._year1)
		if self._year2 is not None:
			parts.append("-%d" % self._year2)
		parts.append(" by")
		if self._firstname is not None:
			parts.append(" %s" % self._firstname)
		if self._lastname is not None:
			parts.append(" %s" % self._lastname)
		if self._email is not None:
			parts.append(" %s" % self._email)

		return "".join(parts)

	def __cmp__(self, other):
		""" order by last name, then by first name. Two holders are
		    identical, if both names match (years and email are not
		    looked at).
		"""
		mine = (self._lastname, self._firstname)
		theirs = (other._lastname, other._firstname)
		return cmp(mine, theirs)

	def __hash__(self):
		""" hash over the names only, in line with __cmp__ """
		return hash(self._lastname + self._firstname)


class LicensedFileBase:
	""" base class for all licensed files

	    Stores the copyright holders and the license that were found in
	    the license text of one file. The text itself is handed in
	    through _process(); a freshly constructed object has no holders
	    and an empty license.
	"""

	# One copyright line:
	#   Copyright [(c)] [year1[-year2]] [by] firstname lastname [<email>][.]
	# terminated by a newline. Compiled with re.VERBOSE, so blanks are
	# only significant inside of [ ].
	_COPYRIGHT_RE = re.compile(r"""
			Copyright
			(?:[ ]+\([cC]\))?
			(?:[ ]+(?P<year1>[\d]+))?
			(?:-(?P<year2>[\d]+))?
			(?:[ ]+by)?
			[ ]+
			(?P<firstname>[\w]+)
			[ ]+
			(?P<lastname>[\w\.]+[\w]|([\w. ,]+))
			(?:[ ]+(?P<email><[\w.]+@[\w.]+>))?
			[.]?[ ]*[\n]
			""", re.VERBOSE)

	def __init__(self, path):
		""" c'tor. Path: path to file to check """
		# path to file
		self._path = path
		# copyright holders list
		self._holders = []
		# license, either by shortcut or by full text.
		self._license = ""
		# is it a generated file?
		self._generated = False
		# full txt (usually None)
		self._full_text = None

	def getPath(self):
		""" get pathname to file """
		return self._path

	def isStandard(self):
		""" is this file authored solely by FAUmachine AUTHORS and
		    distributable under GPL-2+?
		"""
		return (self._license == "GPL-2+"
			and len(self._holders) == 1
			and self._holders[0].isFAUmachineTeam())

	def getLicenseShortcut(self):
		""" returns the license shortcut (if any)
		"""
		return self._license

	def isGenerated(self):
		""" returns True if the file is generated through another file
		"""
		return self._generated

	def _process(self, txt):
		""" set all members by evaluating the textual license txt

		    Holders found in txt are appended to the holder list, the
		    license and the generated flag are overwritten.
		"""
		self._findHolders(txt)
		self._license = LicensedFileBase._detectLicense(txt)
		self._generated = LicensedFileBase._isGenerated(txt)

	@staticmethod
	def _detectLicense(txt):
		""" return the license shortcut for txt, the full license
		    text for BSD like licenses, or "unknown".
		"""
		# order matters: the first matching check wins.
		checks = (
			(LicensedFileBase._isGPL2P, "GPL-2+"),
			(LicensedFileBase._isGPL2only, "GPL-2"),
			(LicensedFileBase._isLGPL2P, "LGPL-2+"),
			(LicensedFileBase._isLGPL21P, "LGPL-2.1+"),
			(LicensedFileBase._isLGPL21only, "LGPL-2.1"),
		)
		for check, shortcut in checks:
			if check(txt):
				return shortcut

		# evaluate the (comparatively costly) BSD check only once
		bsd = LicensedFileBase._getBSDLicense(txt)
		if bsd is not None:
			return bsd

		return "unknown"

	def _findHolders(self, txt):
		""" append one CopyrightHolder for each copyright line in txt
		"""
		for m in LicensedFileBase._COPYRIGHT_RE.finditer(txt):
			# groupdict() only contains the named groups, which
			# are exactly the c'tor arguments.
			self._holders.append(CopyrightHolder(**m.groupdict()))

	def __cmp__(self, other):
		""" comparison method: by license first, then by the sorted
		    lists of copyright holders
		"""
		if self._license != other._license:
			return cmp(self._license, other._license)

		return cmp(sorted(self._holders), sorted(other._holders))

	def __str__(self):
		""" one indented line per holder, followed by the license """
		lines = ["  %s\n" % h for h in self._holders]
		lines.append("  License: %s." % self._license)
		return "".join(lines)

	def __hash__(self):
		""" hash of license and holders, independent of the order of
		    the holders (in line with __cmp__)
		"""
		h1 = hash(self._license)
		for holder in self._holders:
			h1 ^= hash(holder)

		return h1

	@staticmethod
	def _isGPL2P(txt):
		""" does txt denote GPL version 2 or later? """
		return LicensedFileBase._matchLicense((
			"GNU General Public License",
			"either version 2 of",
			"or (at your option) any later version",
		), txt)

	@staticmethod
	def _isGPL2only(txt):
		""" does txt denote GPL version 2 only? """
		return LicensedFileBase._matchLicense((
			"GNU General Public License",
			"version 2 dated",
		), txt)

	@staticmethod
	def _isLGPL2P(txt):
		""" does txt denote LGPL version 2 or later? """
		return LicensedFileBase._matchLicense((
			"GNU Lesser General Public License",
			"either version 2 of",
			"or (at your option) any later version.",
		), txt)

	@staticmethod
	def _isLGPL21P(txt):
		""" does txt denote LGPL version 2.1 or later? """
		return LicensedFileBase._matchLicense((
			"GNU Lesser General Public License",
			"either version 2.1 of",
			"or (at your option) any later version.",
		), txt)

	@staticmethod
	def _isLGPL21only(txt):
		""" does txt denote LGPL version 2.1 only? """
		return LicensedFileBase._matchLicense((
			"GNU Lesser General Public License",
			"version 2.1 of the License",
		), txt)

	@staticmethod
	def _getBSDLicense(txt):
		""" returns None if not BSD like licensed, the full text of the
		    BSD license (prefixed by a newline and two blanks) if so.
		"""
		l_begin = "Permission is hereby granted, free of charge"
		l_end = "THE SOFTWARE."
		phrases = (
			l_begin,
			l_end,
			"without limitation the rights to use, copy, modify,",
			"merge, publish, distribute, sublicense, and",
		)
		if not LicensedFileBase._matchLicense(phrases, txt):
			return None

		# the license text must not run into code ("{")
		pattern = r"%s[^{]*%s" % (
				LicensedFileBase._txtToRegEx(l_begin),
				LicensedFileBase._txtToRegEx(l_end))
		m = re.search(pattern, txt)
		if m is None:
			# all phrases are present, but not as one contiguous
			# text (wrong order, or a "{" in between): this is
			# not a license text we can quote.
			return None

		return "\n  " + m.group(0)

	@staticmethod
	def _txtToRegEx(s):
		""" convert the phrase s to a regular expression, that matches
		    s literally, with every blank matching any non-empty run
		    of whitespace (so that phrases match across line breaks).
		"""
		words = [re.escape(word) for word in s.split(" ")]
		return r"[\s]+".join(words)

	@staticmethod
	def _matchLicense(lines, txt):
		""" return True, if all lines appear in txt, False otherwise.
		"""
		for phrase in lines:
			regex = LicensedFileBase._txtToRegEx(phrase)
			if re.search(regex, txt) is None:
				return False

		return True

	@staticmethod
	def _isGenerated(txt):
		""" return True, if txt contains a marker for generated files
		"""
		gr = r"([gG]enerated (?:by|from|file)|Generator:){1}"
		return re.search(gr, txt) is not None

	@staticmethod
	def _sanitizeText(txt):
		""" return txt with all comment decoration characters
		    (#, /, *, % and ;) removed
		"""
		return re.sub(r"[#/*%;]", "", txt)

class LicensedTextFile(LicensedFileBase):
	""" class for all textual files

	    The license text is taken from the first SOURCE_LINES lines of
	    the file.
	"""
	def __init__(self, path):
		""" c'tor. Path: path to file to check. Scans the file. """
		LicensedFileBase.__init__(self, path)
		self._scan()

	def _scan(self):
		""" read the head of the file and evaluate it.

		    If the file cannot be read, a message is written to
		    stderr, and the file is treated like an empty one.
		"""
		lines = []
		try:
			f = open(self._path, "r")
			try:
				# exactly SOURCE_LINES lines, less if the
				# file is shorter.
				for _ in range(SOURCE_LINES):
					line = f.readline()
					if not line:
						break
					lines.append(line)
			finally:
				f.close()
		except IOError:
			# sys.exc_info() instead of naming the exception in
			# the except clause: that syntax differs between
			# python versions.
			ioe = sys.exc_info()[1]
			# don't evaluate a partially read head
			lines = []
			sys.stderr.write("%s: file %s - %s\n" % (
							sys.argv[0],
							self._path,
							str(ioe)))

		txt = LicensedFileBase._sanitizeText("".join(lines))
		self._process(txt)

class LicensedPNGFile(LicensedFileBase):
	""" class for PNG files

	    The license text is taken from the "Comment" entries of the
	    tEXt chunks of the file.
	"""
	def __init__(self, path):
		""" c'tor. Path: path to file to check. Scans the file. """
		LicensedFileBase.__init__(self, path)
		self._scan()

	def _scan(self):
		""" take the text from the "Comment" section of texts
		    stored in the PNG file

		    A file that cannot be opened is treated like a file
		    without any text. Raises Exception, if the file is not
		    a PNG file, or ends prematurely.
		"""
		try:
			f = open(self._path, "rb")
		except IOError:
			self._process("")
			return

		try:
			txt = self._readComments(f)
		finally:
			f.close()

		self._process(txt)

	def _readComments(self, f):
		""" return the concatenated "Comment" texts of all tEXt chunks
		    of the opened PNG file f, up to the first chunk with a
		    length of 0.
		"""
		magic = f.read(4)
		if magic != "\x89PNG":
			raise Exception("%s is not a PNG file" % self._path)

		# skip remaining header bytes
		f.seek(4, 1)

		txt = ""
		while True:
			# first 32 bit field: length of chunk
			raw = f.read(4)
			if len(raw) < 4:
				raise Exception("Corrupt file %s" % self._path)

			# unpack uint32_t (big endian)
			length = struct.unpack(">L", raw)[0]

			# last field: length==0
			if length == 0:
				break

			# type of chunk
			t = f.read(4)
			if len(t) < 4:
				raise Exception("Corrupt file %s" % self._path)

			if t == "tEXt":
				txt += LicensedPNGFile._readtEXtChunk(f, length)
				# skip crc 32
				f.seek(4, 1)
			else:
				# uninteresting, skip this chunk
				# also skip crc32 at end of chunk
				f.seek(length + 4, 1)

		return txt

	@staticmethod
	def _readtEXtChunk(f, length):
		""" read length bytes of tEXt chunk data from f and return the
		    text stored under the key "Comment" ("" if there is none).

		    Raises Exception, if less than length bytes are left.
		"""
		data = f.read(length)
		if len(data) < length:
			raise Exception("Corrupt PNG file: truncated tEXt chunk")

		# text chunks are in the form (key\0value)+
		fields = data.split("\0")
		if len(fields) % 2:
			# last key has no value
			fields.append("")

		# later entries replace earlier ones with the same key
		comments = dict(zip(fields[0::2], fields[1::2]))

		# default: don't look at other sections
		return comments.get("Comment", "")

class FileRegistry:
	""" keep track of scanned files

	    Files with identical license and copyright holders are grouped
	    together.
	"""
	def __init__(self):
		# plain list with LicensedFileBase of all checked files
		self._files = []
		# dictionary key: file value: list of files
		self._filedict = {}
		# list of glob expressions that denote ignored files
		# (i.e. contents of cvsignore).
		self._ignore_glob = []

	def checkFile(self, path):
		""" check/register file with given path

		    ELF and Mach-O binaries, static libraries and generated
		    files are not registered.
		"""
		if (FileRegistry.isELF(path)
		    or FileRegistry.isMachO(path)
		    or FileRegistry.isStaticLib(path)):
			return

		if FileRegistry.isTextFile(path):
			lf = LicensedTextFile(path)
		elif FileRegistry.isPNGFile(path):
			lf = LicensedPNGFile(path)
		else:
			lf = LicensedFileBase(path)

		if lf.isGenerated():
			return

		self._files.append(lf)
		# the first file of a group also serves as key of the group
		self._filedict.setdefault(lf, []).append(lf)

	def _equalize_holder_years(self):
		""" for a given group, equalize the key copyright holder to
		    have the maximum years bounds from all group entries
		"""
		for key, group in self._filedict.items():
			# sort the holders of each file only once. The order
			# depends on the names only, so it is not affected by
			# the year updates below.
			sorted_holders = [sorted(lf._holders) for lf in group]

			for i, holder in enumerate(sorted(key._holders)):
				min_y = holder._year1
				max_y = holder._year2

				for holders in sorted_holders:
					h = holders[i]
					# all files of one group have
					# identical holders
					assert cmp(h, holder) == 0
					if min_y is None:
						min_y = h._year1
					elif h._year1 is not None:
						min_y = min(min_y, h._year1)

					if max_y is None:
						max_y = h._year2
					elif h._year2 is not None:
						max_y = max(max_y, h._year2)

				holder._year1 = min_y
				holder._year2 = max_y

	def _filterIgnoredEntries(self, root, lst, local_bl, path_bl):
		""" filter all entries from lst that are ignored, which means
		    that these are either in the _ignore_glob list, or are in
		    local_bl, or are (relative to root) in path_bl.

		    root: directory prefix
		    lst: list of filenames/directory names to reduce
		         (modified in place)
		    local_bl: blacklist of filenames w.o. paths
		    path_bl: blacklist of filenames with paths.
		"""
		blacklist = set(local_bl)

		for pattern in self._ignore_glob:
			blacklist.update(fnmatch.filter(lst, pattern))

		for b in path_bl:
			if os.path.dirname(b) == root:
				blacklist.add(os.path.basename(b))

		# slice assignment: the caller's list object must be kept
		lst[:] = [e for e in lst if e not in blacklist]

	def _checkVCSIgnore(self, directory):
		""" load the ignore patterns that apply to directory """
		self._checkCVSIgnore(directory)

	def _checkCVSIgnore(self, directory):
		""" set the ignore patterns from the .cvsignore file in
		    directory. Without such a file, nothing is ignored.
		"""
		p = os.path.join(directory, ".cvsignore")

		try:
			f = open(p, "r")
			try:
				patterns = [x.strip() for x in f.readlines()]
			finally:
				f.close()
		except IOError:
			# file doesn't exist, that's ok here.
			# NOTE(review): the temporary file patterns below are
			# not applied in this case - confirm that this is
			# intended.
			self._ignore_glob = []
			return

		# temporary swap files
		patterns.append("*.swp")
		# cvs conflicts
		patterns.append(".#*")
		# temporary files
		patterns.append("*~")
		self._ignore_glob = patterns

	def traverse(self):
		""" check all files below the current directory, that are
		    neither ignored nor blacklisted, in sorted order.
		"""
		for root, dirs, files in os.walk("."):
			self._checkVCSIgnore(root)
			# entries removed from dirs are not visited by
			# os.walk
			self._filterIgnoredEntries(root,
						dirs,
						BLACKLISTED_DIRS,
						BLACKLISTED_PATHDIRS)
			self._filterIgnoredEntries(root,
						files,
						BLACKLISTED_FILES,
						BLACKLISTED_PATHFILES)

			files.sort()
			dirs.sort()
			for f in files:
				self.checkFile(os.path.join(root, f))

		self._equalize_holder_years()

	def __str__(self):
		""" one paragraph per group: the paths of the files, followed
		    by the holders and the license of the group.
		"""
		chunks = []
		# sort by path name of key item.
		groups = sorted(self._filedict.items(),
				key=lambda item: item[0]._path)

		for key, valuelist in groups:
			valuelist.sort(key=lambda lf: lf._path)

			last = len(valuelist) - 1
			for idx, lf in enumerate(valuelist):
				prefix = "  "
				if idx == 0:
					prefix = "* "
				suffix = ","
				if idx == last:
					suffix = ":"
				chunks.append("%s%s%s\n" % (
						prefix, lf.getPath(), suffix))

			chunks.append("\n%s\n\n" % str(key))

		return "".join(chunks)

	def filterOutDefault(self, firstname, lastname, license):
		""" drop the group of files, which have the given license and
		    the given person as only copyright holder.
		"""
		# years and email don't matter for the lookup
		lf = LicensedFileBase("dummy")
		lf._holders.append(
			CopyrightHolder(firstname, lastname, "", 0, 0))
		lf._license = license

		self._filedict.pop(lf, None)

	@staticmethod
	def _readMagic(path):
		""" return the first four bytes of the file as tuple of
		    integers, or None if the file cannot be read or is
		    shorter than that.
		"""
		try:
			f = open(path, "rb")
			try:
				head = f.read(4)
			finally:
				f.close()
		except IOError:
			return None

		if len(head) < 4:
			return None

		return struct.unpack("4B", head)

	@staticmethod
	def isELF(path):
		""" is path an ELF file?
		"""
		# tiny magic: check for ELF magic bytes (0x7F 'E' 'L' 'F')
		magic = FileRegistry._readMagic(path)
		return magic == (0x7F, 0x45, 0x4C, 0x46)

	@staticmethod
	def isMachO(path):
		""" is path a Mach-O binary?
		"""
		magic = FileRegistry._readMagic(path)
		return magic in (
			(0xCE, 0xFA, 0xED, 0xFE),
			(0xCF, 0xFA, 0xED, 0xFE),
		)

	@staticmethod
	def isStaticLib(path):
		""" is path a static library (by suffix)?
		"""
		return path.endswith(".a")

	@staticmethod
	def isTextFile(path):
		""" is path a text file? Everything, that doesn't carry one of
		    the known binary suffixes, is treated as text.
		"""
		# quick filter for binary files.
		for suffix in (".o", ".a", ".rom", ".png"):
			if path.endswith(suffix):
				return False

		return True

	@staticmethod
	def isPNGFile(path):
		""" is path a PNG file?
		"""
		if not path.endswith(".png"):
			return False

		# 0x89 'P' 'N' 'G'
		magic = FileRegistry._readMagic(path)
		return magic == (0x89, 0x50, 0x4E, 0x47)


class GlueCopyright:
	""" generates the copyright text for the source tree below the
	    current directory, and compares it with a reference file.
	"""
	def __init__(self, reffile="copyright"):
		""" c'tor. reffile: path to the reference file """
		self._reffile = reffile

	@staticmethod
	def _needCheck():
		""" returns True, if the current directory contains both a
		    "copyright" and a ".cvsignore" entry. Otherwise the
		    reason for skipping the check is written to stderr.
		"""
		entries = os.listdir(".")
		if "copyright" not in entries:
			sys.stderr.write(
				"%s: Please call from top directory.\n" %
				sys.argv[0])
			sys.stderr.write("%s: Skipping check.\n" %
				sys.argv[0])
			return False

		if ".cvsignore" not in entries:
			sys.stderr.write(
				"%s: .cvsignore not found, not checking.\n" %
				sys.argv[0])
			return False

		return True

	def _printDiff(self, reference, generated):
		""" print a unified diff from the text reference to the text
		    generated on stdout, framed by empty lines.
		"""
		delta = difflib.unified_diff(reference.split("\n"),
						generated.split("\n"),
						self._reffile,
						"generated output",
						lineterm="")

		out = sys.stdout
		out.write("\n")
		for line in delta:
			out.write(line + "\n")
		out.write("\n\n")

	def performCheck(self):
		""" performs the check, returns True if succeeded, or if
		    check was skipped (e.g. if not a checkout), false if
		    the reference file is not identical.
		"""
		if not GlueCopyright._needCheck():
			return True

		checkText = self.genCopyright().strip()

		f = open(self._reffile, "r")
		try:
			ref_text = f.read().strip()
		finally:
			f.close()

		if ref_text == checkText:
			return True

		sys.stderr.write("%s: WARNING: License texts mismatch\n" %
			sys.argv[0])
		self._printDiff(ref_text, checkText)
		return False

	def genCopyright(self):
		""" return copyright text string

		    The text consists of a fixed preface, the groups of files
		    that are not solely (c) FAUmachine Team under GPL-2+, and
		    a fixed entry for the keymaps.
		"""
		fr = FileRegistry()
		fr.traverse()
		fr.filterOutDefault("FAUmachine", "Team", "GPL-2+")

		# NOTE: the text is compared verbatim with the reference file.
		# Blanks at the end of a line are written as \x20, so that
		# stripping trailing whitespace cannot change the text.
		preface = \
"""FAUmachine's sources are available at <http://www.faumachine.org>.

All files, including packaging (exceptions see below):

FAUmachine is\x20
  Copyright (C) 2000-2010 Friedrich Alexander University Erlangen-Nuremberg,
                          Germany - Department of Computer Science 3,
			  Volkmar Sieh, et al. (See AUTHORS).

 In the project files, as well as the remainder of this document,
 the term "FAUmachine Team" is used to denote the persons listed in the
 AUTHORS file as copyright holders, together with the Friedrich Alexander
 University Elangen-Nuremberg, Germany - Department of Computer Science 3,
 since a number of authors worked on FAUmachine as part of the work at\x20
 the university.

 FAUmachine comes with ABSOLUTELY NO WARRANTY.
 This is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the
 Free Software Foundation; either version 2, or (at your option)
 any later version. Look at COPYING for details.

On Debian systems, the full text of the GPL, version 2 can be found at
'/usr/share/common-licenses/GPL-2.'

In the remainder of this document the following abbreviations are used:

GPL-2:
 This program is free software; you can redistribute it and/or modify it\x20
 under the terms of the GNU General Public License as published by the
 Free Software Foundation, version 2.

On Debian systems, see '/usr/share/common-licenses/GPL-2.' for the full text
of the GNU General Public License, version 2.

GPL-2+:
 This program is free software. You can redistribute it and/or modify it
 under the terms of the GNU General Public License, either version 2 of
 the License, or (at your option) any later version.

On Debian systems, see '/usr/share/common-licenses/GPL' for the full text
of the GNU General Public License in its latest version.

LGPL-2+:
 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Library General Public
 License as published by the Free Software Foundation; either
 version 2 of the License, or (at your option) any later version.

On Debian systems, see '/usr/share/common-licenses/LGPL' for the full text
of the GNU Lesser General Public License in its latest version.

LGPL-2.1+:
 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

On Debian systems, see '/usr/share/common-licenses/LGPL' for the full text
of the GNU Lesser General Public License in its latest version.

The files have a different license and/or different authors:
"""
		special_files = \
"""* ./lib/keymaps/*:

  Based on keymaps from the rdesktop package and modified for FAUmachine.
  Copyright (C) FAUmachine team,
  Copyright (C) Matthew Chapman 1999-2000.
  License: GPL-2, with the following additional exemption:

  This software is released under the GNU General Public License
  (reproduced below) with the additional exemption that compiling,
  linking, and/or using OpenSSL together with this software is
  allowed."""
		return preface + str(fr) + special_files
	
if __name__ == '__main__':
	# "-p" as only argument: print the generated copyright text.
	# Otherwise compare it with the reference file; the exit status
	# is 0 if the check passed (or was skipped), 1 on a mismatch.
	checker = GlueCopyright()
	if sys.argv[1:] == ["-p"]:
		sys.stdout.write(checker.genCopyright() + "\n")
		sys.exit(0)

	if checker.performCheck():
		sys.exit(0)
	else:
		sys.exit(1)
