See these two posts for a background:
Here is some output of code I've written in Python. Ask if you want to run it. This output was generated with the input $s = abcabcabc$ (three concatenated copies of $abc$).
# generators = 6:
[A -> cab, B -> bca, C -> ab, D -> bc, E -> abc, F -> ca]
# group elems = 107:
{1, A, B, C, D, E, F, AA, AB, AC, AD, AE, AF, BA, BB,
BC, BD, BF, CA, CB, CC, CD, CE, CF, DA, DB, DC, DD, DE, DF, EA, EB, EC, ED,
EE, EF, FC, FD, FE, FF, AAC, AAF, ABF, ACC, ACD, ACF, ADF, AEC, AEF, AFF,
BAC, BAF, BBF, BCF, BDF, BFF, CAC, CAF, CBF, CCC, CCD, CCF, CDC, CDD, CDF,
CEC, CFC, CFD, CFF, DAC, DAF, DBF, DCC, DCD, DCF, DDC, DDD, DDF, DEC, DFC,
DFD, DFF, EAC, ECC, ECD, ECF, EEC, FCC, FCD, FCF, FDC, FDD, FDF, FEC, FFC,
FFD, FFF, AACF, ACCF, ACDF, ACFF, AECF, BACF, CACF, DACF, FFA, FFB, FFAC}
The generators of the group are defined to be the irreducible substrings of the input string $s$. If we further say that these must be compressibly repeating (as non-compressible repeating substrings don't reduce grammar size), then:
# generators = 5:
[A -> cab, B -> bca, C -> ab, D -> bc, E -> abc]
# group elems = 69:
{1, A, B, C, D, E, AA, AC, AD, AE, BA, BB, BC, BD, BE, CB, CC, CD, CE, DA,
DB, DC, DD, DE, EA, EB, EC, ED, EE, AAC, ACC, ACD, ACE, BBD, BCD, BDD, BDE,
CBD, CCC, CCD, CCE, CDC, CDD, CDE, CEC, CED, CEE, DAC, DBD, DCC, DCD, DCE,
DDC, DDD, DDE, DEC, DED, DEE, EAC, EBD, ECC, ECD, ECE, EDC, EDD, EDE, EEC,
EED, EEE, CCA}
So as you can see, just by that one small change we nearly cut the size of the action group in half.
Was wondering what other methods can be used to reduce the size of the action group and thus the computation time of a recursive smallest grammar algorithm.
Details
We define the left action of a grammar rule such as $A\to abc$ on the string $s = abcabc$ (for example) by taking the leftmost occurrence of $abc \leqslant s$ and collapsing it to the variable $A$. Applying $A$ twice yields $A^2 \circ s = AA$ in this case. Applying it a third time restarts the process by expanding all occurrences of $A$. Since a smallest grammar, expanded fully (up to pure literal rules), always has a leftmost variable occurrence — or, if it doesn't, you can transfer over to one that does — we believe one could enumerate all possible near-fully expanded grammars by acting on the start string $s$ with the set of all irreducible (and compressibly repeating) rules. Once you have this enumeration, you then recurse (see the above links). Of course, without further study, the algorithm does appear to be of non-polynomial complexity.
Is there a way to filter out more "fruitless" elements from the group $G(s)$?
For example, I'm next going to look into the case when $\text{RHS}(A) \leqslant \text{RHS}(B)$ as strings, since then you can't have both literal rules appear in a smallest grammar.
Here is some of the code.
import re # For creating unique variable names
import copy
import sys
from tools import replaceSublist, findSublists
def substringIndices(string, min_length=1):
    """Map each substring of `string` with length >= `min_length` to the
    list of start indices (in increasing order) at which it occurs."""
    index_map = {}
    n = len(string)
    for start in range(n):
        for length in range(min_length, n - start + 1):
            piece = string[start:start + length]
            index_map.setdefault(piece, []).append(start)
    return index_map
def isReducibleAsSubstring(substr, indices, compressible=False):
    """Return True when `substr` has a later occurrence that does not
    overlap its first one.

    With `compressible=True` the repeat must also be able to shrink a
    grammar: length >= 3 always qualifies, while a length-2 substring
    only qualifies from its third occurrence onward.
    """
    length = len(substr)
    first = indices[0]
    for pos_num, start in enumerate(indices[1:], start=1):
        # Occurrence at `start` is disjoint from the first occurrence.
        if start >= first + length:
            if not compressible:
                return True
            if length >= 3 or (length == 2 and pos_num > 1):
                return True
    return False
def substringsReducibleInString(string, compressible=False):
    """Collect the substrings of `string` (length >= 2) that repeat
    non-overlappingly, each mapped to its occurrence indices."""
    candidates = substringIndices(string, min_length=2)
    return {
        s: idxs
        for s, idxs in candidates.items()
        if isReducibleAsSubstring(s, idxs, compressible)
    }
def isIrreducible(string, incompressible=False):
    """True when `string` itself contains no reducible substring."""
    return not substringsReducibleInString(string, compressible=incompressible)
def irreducibleReduciblyRepeatingSubstrings(string, incompressibles=False, compressibly_rep=False):
    """Reducible substrings of `string` whose own contents are
    irreducible — these become the group's generator right-hand sides."""
    reducible = substringsReducibleInString(string, compressibly_rep)
    return {
        s: idxs
        for s, idxs in reducible.items()
        if isIrreducible(s, incompressibles)
    }
class SymbolString:
    """A string of grammar symbols stored as a list, so that
    multi-character variable names and single literals can coexist as
    individual entries."""

    def __init__(self, string=None, vec=None):
        # An explicit symbol list wins; otherwise split the plain string
        # into one symbol per character.
        self.vector = vec if vec is not None else list(string)

    def replace(self, old, new, num=None):
        # num = None means replace all occurrences.
        replaceSublist(self.vector, old.vector, new.vector, maxreplace=num)

    def contains(self, sym_str):
        return findSublists(self.vector, sym_str.vector) is not None

    # For determining if a symbol string is already listed
    # TODO: make efficient with dictionary lookup. Should work if we convert back to string
    # Assuming variables were created, knowing all substrings so that A1 doesn't occur for sure
    # as both a variable and a literal substring
    def equals(self, string):
        other = string.vector
        if len(self.vector) != len(other):
            return False
        return all(a == b for a, b in zip(self.vector, other))

    def __repr__(self):
        return ''.join(self.vector)
class GrammarRule:
    """A production `variable -> RHS` that also acts on symbol strings:
    it collapses the leftmost occurrence of its RHS, or, if the RHS is
    absent, expands every occurrence of its variable."""

    def __init__(self, var, rhs):
        self.variable = var
        self.RHS = rhs

    def __repr__(self):
        return self.variable + ' -> ' + str(self.RHS)

    #TODO: delete if not needed, or add in rule creation / deletion
    def leftmostActionOnGrammar(self, grammar):
        """Act on a grammar's start rule and return the resulting grammar."""
        rules = list(grammar.rules)
        rules.remove(grammar.S)
        start_rule = self.leftmostActionOnRule(grammar.S)
        # BUG FIX: list.insert returns None, so the original
        # `rules = rules.insert(0, start_rule)` threw the list away.
        rules.insert(0, start_rule)
        return Grammar(rules, start_rule)

    def leftmostActionOnRule(self, rule):
        """Act on a rule's RHS, preserving its left-hand variable."""
        res_str = self.leftmostActionOnString(rule.RHS)
        # BUG FIX: the attribute is `variable`; `rule.var` raised
        # AttributeError.
        return GrammarRule(rule.variable, res_str)

    #TODO refactor: make SymbolStrings into just lists of strings
    def leftmostActionOnString(self, sym_str):
        """Collapse the leftmost occurrence of RHS in `sym_str` to this
        rule's variable; if RHS does not occur, expand all occurrences
        of the variable back into RHS instead."""
        res_str = copy.copy(sym_str)
        # If substring even occurs
        if res_str.contains(self.RHS):
            res_str.replace(self.RHS, SymbolString(vec=[self.variable]), num=1)
        else:
            res_str.replace(SymbolString(vec=[self.variable]), self.RHS, num=None)
        return res_str
class Grammar:
    """A collection of rules with a distinguished start rule `S`
    (defaults to the first rule supplied)."""

    def __init__(self, rules, start_rule=None):
        self.rules = rules
        self.S = rules[0] if start_rule is None else start_rule
# assumes current_vars is ordered (A, ..., Z, A1, ... Z1, A2, ..., Z2, ...)
varIndexRegex = re.compile(r'(\d+)$')


def createUniqueGrammarVar(current_vars=None):
    """Return (new_var, current_vars): the next unused variable name in
    the sequence A..Z, A1..Z1, A2..Z2, ..., appended to `current_vars`.

    TODO: modify to handle capitals/digits in the input string;
    currently only works with the formal-language convention of
    all-lower-alpha strings, e.g. 'aabcaddsadabadfsdfa'.
    """
    # BUG FIX: the original appended to `current_vars` even when it was
    # None (AttributeError); normalize to a fresh list first. This also
    # removes the dead `if current_vars is None` check that sat inside
    # the branch which had already excluded None.
    if current_vars is None:
        current_vars = []
    if not current_vars:
        A = 'A'
    else:
        highest_var = current_vars[-1]
        search = varIndexRegex.search(highest_var)
        if search is None:
            # Plain letter: advance it, wrapping Z -> A1.
            A = 'A1' if highest_var == 'Z' else chr(ord(highest_var) + 1)
        else:
            # Letter with numeric suffix: advance the letter, wrapping
            # Z<n> -> A<n+1>.
            span = search.span()
            var = highest_var[0:span[0]]
            index = int(search.group())
            if highest_var == 'Z' + str(index):
                A = 'A' + str(index + 1)
            else:
                A = chr(ord(var) + 1) + str(index)
    current_vars.append(A)
    return A, current_vars
def createGroupGeneratorRules(strings):
    """Build one grammar rule per string, assigning fresh variable names
    in order."""
    rules = []
    variables = []
    for s in strings:
        name, variables = createUniqueGrammarVar(current_vars=variables)
        rules.append(GrammarRule(name, SymbolString(s)))
    return rules
# TODO: probably refactor above classes so that
# there is a SGAG element class that acts on SymbolString
class SmallestGrammarAlgorithmGroup:
    """Closes the set of irreducible-substring rules under composition,
    tracking the orbit of the start string: a composite is kept only
    when its action produces a string not yet seen in the orbit."""

    def __init__(self, string, incompressibles=False, compressibly_rep=False):
        irreducibles = irreducibleReduciblyRepeatingSubstrings(string, incompressibles, compressibly_rep)
        self.generatorRules = createGroupGeneratorRules(irreducibles.keys())
        self.variables = [R.variable for R in self.generatorRules]
        # Elements are tuples of rules, applied left-to-right by
        # actOnLeftByElement.
        self.generators = [(x,) for x in self.generatorRules]
        self.elements = copy.copy(self.generators)
        self.string = SymbolString(string)
        # Lazily filled by generateGroup.
        self.stringOrbit = None

    def computeInitialOrbit(self):
        """Orbit of the start string under each single-generator element."""
        s = self.string
        orbit = []
        for x in self.elements:
            s1 = self.actOnLeftByElement(x, s)
            orbit.append(s1)  # Yes, should automatically be unique already by definition of substring functions
        return orbit

    def generateGroup(self):
        """Repeatedly multiply elements together until a fixed point:
        no composite yields a new orbit string."""
        if self.stringOrbit is None:
            self.stringOrbit = self.computeInitialOrbit()
        size = len(self.elements)
        self._mulEq(self)
        while len(self.elements) > size:
            size = len(self.elements)
            self._mulEq(self)

    def _mulEq(self, group):
        """Extend self.elements with products x*y whose action on the
        start string is new; extension happens only after the full pass,
        so each call multiplies the current element set with itself."""
        elements = self.elements
        elements_ext = []
        s = self.string
        for x in self.elements:
            for y in group.elements:
                z = x + y  # tuple concat +
                s1 = self.actOnLeftByElement(z, s)
                if not self.orbitContains(s1):
                    self.stringOrbit.append(s1)
                    elements_ext.append(z)
        self.elements.extend(elements_ext)

    def orbitContains(self, string):
        # Linear scan using SymbolString.equals; O(orbit size) per test.
        for orb_str in self.stringOrbit:
            if orb_str.equals(string):
                return True
        return False

    def actOnLeftByElement(self, elem_tuple, sym_str):
        """Apply each rule in `elem_tuple`, in order, to a deep copy of
        `sym_str` (the original is never mutated)."""
        res_str = copy.deepcopy(sym_str)
        for x in elem_tuple:
            res_str = x.leftmostActionOnString(res_str)
        return res_str

    def __str__(self):
        # Render elements as concatenated variable names, e.g. {A, AB, ...}.
        string = '{'
        for elem in self.elements:
            # TODO: refactor into Elem class to do this as well:
            elem_str = ''
            for x in elem:
                elem_str += x.variable
            string += elem_str + ', '
        string = string[0:-2]  # Take off last comma/space
        string += '}'
        return string
Email me at fruitfulapproach (google mail) if you want help running it and the other files required.
Some observations.
$\#_s(A) + 1$ is always equal to the order of $A$, by definition.
The group $G(s)$ reflects properties of the orbit $G \circ s$.
- Conjecture: $\langle A \rangle$ is normal in $G(s)$ if and only if we can construct a smallest grammar by maximizing the number of copies of $A$ that occur.