1005 Programming Pattern
Statement
Metadata
- 作者: HOU, Qiming
- 单位: 浙江大学
- 代码长度限制: 16 KB
- 时间限制: 800 ms
- 内存限制: 64 MB
Programmers often have a preference among program constructs. For example, some may prefer if(0==a)
, while others may prefer if(!a)
. Analyzing such patterns can help to narrow down a programmer's identity, which is useful for detecting plagiarism.
Now given some text sampled from someone's program, can you find the person's most commonly used pattern of a specific length?
Input Specification
Each input file contains one test case. For each case, there is one line consisting of the pattern length \n
. The entire input is case sensitive.
Output Specification
For each test case, print in one line the length-
Whitespace characters in the input should be printed as they are. Also note that there may be multiple occurrences of the same substring overlapping each other.
Sample Input 1
Sample Output 1
Sample Input 2
Sample Output 2
Tutorial
题意:
给出一个字符串
并且还要输出出现次数。
思路:
考虑跑一个 SA
,然后发现长度为
并且字典序越小的,后缀排名肯定越小,所以扫一遍
Solution
#include <bits/stdc++.h>
using namespace std;
const int N = 1048576 * 3 + 10;
int n;
char s[N];
struct DA {
//求SA数组需要用到的中间变量,不需要赋值
// c数组的范围为m + 1
int t1[N], t2[N], c[N];
int sa[N];
int Rank[N];
int height[N];
//待排序的字符串放在str数组中,从str[0] - s[n - 1]长度为n, 最大值小于m
int str[N];
int n, m;
void init(char *s, int m, int n) {
this->m = m;
this->n = n;
for (int i = 0; i < n; ++i) str[i] = s[i];
str[n] = 0;
}
bool cmp(int *r, int a, int b, int l) {
return r[a] == r[b] && r[a + l] == r[b + l];
}
void work() {
++n;
int i, j, p, *x = t1, *y = t2;
//第一轮基数排序,如果s的最大值很大,可改为快速排序
for (i = 0; i < m; ++i) c[i] = 0;
for (i = 0; i < n; ++i) c[x[i] = str[i]]++;
for (i = 1; i < m; ++i) c[i] += c[i - 1];
for (i = n - 1; i >= 0; --i) sa[--c[x[i]]] = i;
for (j = 1; j <= n; j <<= 1) {
p = 0;
//直接利用sa数组排序第二关键字
//后面的j个数第二关键字为空的最小
for (i = n - j; i < n; ++i) {
y[p++] = i;
}
for (i = 0; i < n; ++i)
if (sa[i] >= j)
y[p++] = sa[i] - j;
//这样数组y保存的就是按照第二关键字排序的结果
//基数排序第一关键字
for (i = 0; i < m; ++i) c[i] = 0;
for (i = 0; i < n; ++i) c[x[y[i]]]++;
for (i = 1; i < m; ++i) c[i] += c[i - 1];
for (i = n - 1; i >= 0; --i) sa[--c[x[y[i]]]] = y[i];
//根据sa和x数组计算新的x数组
swap(x, y);
p = 1;
x[sa[0]] = 0;
for (i = 1; i < n; ++i) {
x[sa[i]] = cmp(y, sa[i - 1], sa[i], j) ? p - 1 : p++;
}
if (p >= n)
break;
//下次基数排序的最大值
m = p;
}
int k = 0;
--n;
for (i = 0; i <= n; ++i) Rank[sa[i]] = i;
// build height
for (i = 0; i < n; ++i) {
if (k)
--k;
j = sa[Rank[i] - 1];
while (str[i + k] == str[j + k]) ++k;
height[Rank[i]] = k;
}
}
} da;
int main() {
scanf("%d\n%[^\n]", &n, s);
int len = strlen(s);
da.init(s, 128, len);
da.work();
// da.work(s, len, 128);
int pre = 1, Max = 1, pos = da.sa[1];
for (int i = 2; i <= len; ++i) {
if (da.height[i] >= n) {
++pre;
if (pre > Max) {
Max = pre;
pos = da.sa[i];
}
} else {
pre = 1;
}
}
s[pos + n] = 0;
printf("%s %d\n", s + pos, Max);
return 0;
}
Tutorial1
考虑 Hash
,求出现的最大次数。
比较字典序,直接二分 lcp
进行比较即可,比较的次数不会超过
本来用 map
做,有个点一直 TLE
,用 unordered_map
也无济于事。
后来发现可以直接 sort
,然后扫一遍就 WIN
了。
Solution1
#include <bits/stdc++.h>
using namespace std;
#define fi first
#define se second
#define SZ(x) (int(x.size()))
using pII = pair<int, int>;
const int N = 1.1e6 + 10, INF = 0x3f3f3f3f;
int n;
string s;
struct Hash {
int seed, mod;
int base[N], a[N];
void init(int _seed, int _mod, const string &s) {
seed = _seed;
mod = _mod;
base[0] = 1;
for (int i = 1; i < N; ++i) {
base[i] = 1ll * base[i - 1] * seed % mod;
}
a[0] = 0;
for (int i = 1; i < SZ(s); ++i) {
a[i] = (1ll * a[i - 1] * seed % mod + s[i]) % mod;
}
}
inline int get(int l, int r) {
return (a[r] - 1ll * a[l - 1] * base[r - l + 1] % mod + mod) % mod;
}
} hs;
inline bool ok(int x, int y) {
if (hs.get(x, x + n - 1) == hs.get(y, y + n - 1))
return false;
int l = 1, r = n, res = 0;
while (r - l >= 0) {
int mid = (l + r) >> 1;
if (hs.get(x, x + mid - 1) == hs.get(y, y + mid - 1)) {
res = mid;
l = mid + 1;
} else {
r = mid - 1;
}
}
if (res < n && s[x + res] > s[y + res])
return true;
return false;
}
struct node {
int ix, val, num;
node() {}
node(int ix, int val, int num) : ix(ix), val(val), num(num) {}
};
int main() {
ios::sync_with_stdio(false);
cin.tie(0);
cout.tie(0);
cin >> n;
getline(cin, s);
getline(cin, s);
s.insert(0, "0");
hs.init(233, 998244353, s);
vector<node> vec, _vec;
for (int i = 1; i + n <= SZ(s); ++i) {
vec.emplace_back(i, hs.get(i, i + n - 1), 1);
}
sort(vec.begin(), vec.end(), [&](node a, node b) {
return a.val < b.val;
});
_vec.push_back(vec.back());
vec.pop_back();
while (!vec.empty()) {
node tmp = vec.back();
vec.pop_back();
if (tmp.val == _vec.back().val)
++_vec.back().num;
else
_vec.push_back(tmp);
}
sort(_vec.begin(), _vec.end(), [&](node a, node b) {
return a.num > b.num;
});
node res = _vec[0];
for (int i = 1; i < SZ(_vec); ++i) {
node now = _vec[i];
if (res.num > now.num)
break;
else if (ok(res.ix, now.ix))
res = now;
}
cout << s.substr(res.ix, n) << " " << res.num << "\n";
return 0;
}
Tutorial2
想不通为什么这些「暴力」可以过。
Solution2
#include <bits/extc++.h>
#include <bits/stdc++.h>
using namespace std;
using namespace __gnu_pbds;
#define fi first
#define se second
const int N = 1048576 + 10;
int n;
string s;
typedef unsigned int ull;
// map<ull, int> mp, id;
struct Hash {
static ull base[N];
static void init() {
base[0] = 1;
for (int i = 1; i < N; ++i) base[i] = base[i - 1] * 131;
}
ull a[N];
//最好改成从1开始,因为查询区间Hash值是前缀和形式
inline void gao(string &s) {
a[0] = 0;
int len = s.size();
for (int i = 1; i <= len; ++i) {
a[i] = a[i - 1] * 131 + (s[i - 1] + 1);
}
}
inline ull get(int l, int r) {
++l, ++r;
return a[r] - a[l - 1] * base[r - l + 1];
}
} hs;
ull Hash::base[N] = {0};
unordered_map<ull, int> mp;
int main() {
Hash::init();
ios::sync_with_stdio(false);
cin.tie(0);
cout.tie(0);
cin >> n;
getline(cin, s);
getline(cin, s);
hs.gao(s);
int Max = 0, pos = -1;
int len = s.size();
for (int i = 0; i + n - 1 < len; ++i) {
ull val = hs.get(i, i + n - 1);
++mp[val];
if (mp[val] > Max) {
Max = mp[val];
pos = i;
} else if (mp[val] == Max) {
for (int j = 0; j < n; ++j)
if (s[pos + j] != s[i + j]) {
if (s[pos + j] > s[i + j])
pos = i;
break;
}
}
}
cout << s.substr(pos, n) << " " << Max << "\n";
return 0;
}