postgresql+repmgr+keepalived (16.0)

ip6 postgresql 5432:用于接收客户端连接请求
ip7 postgresql 5432:用于接收客户端连接请求

前置准备工作(ip6/ip7 两台服务器执行相同操作)

1
2
3
4
5
6
7
8
9
10
11
mkdir -p /data
adduser app
passwd app

# 设置密码如下
yourpassword

chown -R app:app /data
vim /etc/sudoers

app ALL = (root) NOPASSWD:/usr/bin/systemctl start postgresql.service,/usr/bin/systemctl restart postgresql.service,/usr/bin/systemctl stop postgresql.service,/usr/bin/systemctl reload postgresql.service,/usr/bin/systemctl status postgresql.service
1
2
3
4
vim /eto/hosts

ip6 storageServer3
ip7 storageServer4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
ssh-keygen -t rsa -b 4096
cd ~/.ssh
cat id_rsa.pub

# 在第一台服务器上创建authorized keys
vim authorized_keys

# 同一组别内不能赋予写权限,否则免密失败
chmod 600 ~/.ssh/authorized_keys

# 把其他服务器的密钥id_rsa.pub加入authorized_keys
# 分发到其他服务器上
scp /home/app/.ssh/authorized_keys pguser@{目标IP}:/home/app/.ssh/authorized_keys

# 用ssh测试服务器之间连通性
ssh app@{目标IP}

postgresql

1
2
3
4
5
6
7
8
9
yum install -y gcc gcc-c++&&
yum install -y libicu-dev &&
yum install -y readline-devel&&
yum install -y zlib-devel

./configure --prefix=/data/postgresql16 --without-icu

make && make install

  • postgresql systemd
1
2
3
4
5
6
7
cd /usr/lib/systemd/system/

vim postgresql.service

//配置自启
systemctl deamon-reload
systemctl enable postgresql.service
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
[Unit]
Description=postgresql.service
After=network.target

[Service]
Type=forking
User=app
Group=app
WorkingDirectory=/data/postgresql16
ExecStart=/data/postgresql16/bin/pg_ctl start -D /data/postgresql16/data
ExecReload=/data/postgresql16/bin/pg_ctl restart -D /data/postgresql16/data
ExecStop=/data/postgresql16/bin/pg_ctl stop -D /data/postgresql16/data
Restart=always
RestartSec=3

[Install]
WantedBy=multi-user.target

  • postgresql初始化
1
2
3
4
5
6
# 从节点不需要注册
initdb -Upostgres -W -D /data/postgresql16/data -k

# 密码如下
yourpassword

  • postgresql.conf配置修改

  • vim /data/postgresql16/data/postgresql.conf

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# 监听地址,'0.0.0.0' 表示监听所有可用的 IP 地址;
listen_addresses = '0.0.0.0'

# 最大连接数,100 为默认值,修改需要重新启动 PostgreSQL;
max_connections = 100

# UNIX 套接字目录,修改需要重新启动 PostgreSQL;
unix_socket_directories = '/tmp'

# 共享缓冲区大小,至少为 128MB;
shared_buffers = 128MB

# 动态共享内存类型,默认通常是 posix;
dynamic_shared_memory_type = posix

# WAL 级别,可选值为 'minimal', 'replica', 或 'logical';
wal_level = 'hot_standby'

# 同时进行非关键更新的完整页写入;
wal_log_hints = on

# 最大 WAL 文件大小为 1GB;
max_wal_size = 1GB

# 最小 WAL 文件大小为 80MB;
min_wal_size = 80MB

# 启用归档模式,可选值为 off, on, 或 always;
archive_mode = on

# 用于归档 WAL 文件的命令,'/bin/true' 表示禁用归档;
archive_command = 'cp %p /data/postgresql16/archive/%f'

# 最大的 WAL 发送者进程数;
max_wal_senders = 10

# 热备模式,on 表示允许在恢复期间执行查询,off 则禁止;
hot_standby = on

# 日志时区,'Asia/Shanghai' 表示上海时区;
log_timezone = 'Asia/Shanghai'

# 日期样式,'iso, mdy' 表示国际标准日期格式;
datestyle = 'iso, mdy'

# 时区,'Asia/Shanghai' 表示上海时区;
timezone = 'Asia/Shanghai'

# 系统错误消息的区域设置;
lc_messages = 'en_US.UTF-8'

# 货币格式的区域设置;
lc_monetary = 'en_US.UTF-8'

# 数字格式的区域设置;
lc_numeric = 'en_US.UTF-8'

# 时间格式的区域设置;
lc_time = 'en_US.UTF-8'

# 默认文本搜索配置;
default_text_search_config = 'pg_catalog.english'

# 预加载库,'repmgr' 表示预加载 repmgr 模块,修改需要重新启动 PostgreSQL;
shared_preload_libraries = 'repmgr'

  • pg_hba.conf追加
1
2
3
4
5
6
7
8
9
10
11
12
# Allow replication connections from localhost, by a user with the
# replication privilege.
local replication all trust
host replication all 127.0.0.1/32 trust
host replication all ::1/128 trust

host all all ip6/24 trust
host all all ip7/24 trust
host replication repmgr ip7/24 trust
host replication repmgr ip7/24 trust

host all all 0.0.0.0/0 md5

repmgr

  • repmgr解压
1
2
3
4
tar -zxvf repmgr-5.4.1.tar.gz -C /data
mv repmgr-5.4.1/ repmgr541
chown -R postgres:postgres /data/repmgr541/

  • repmgr编译
1
2
3
4
5
6
7
8
9
10
11
12
13
14
yum install libcurl-devel

yum install json-c-devel
# 如果安装失败,上传json安装包后执行以下操作
rpm -ivh json-c-devel-0.11-4.el7_0.x86_64.rpm

yum install flex
export PG_CONFIG=/data/postgresql16/bin/pg_config
./configure
make && make install

# 编译成功后,执行以下命令可以发现两个repmgr文件
ll /data/postgresql16/bin | grep repmgr

  • repmgr配置
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
cd /data/postgresql16
vim repmgr.conf

#repmgr.conf主节点ip6内容
node_id=1
node_name=node1
conninfo='host=storageserver3 user=repmgr dbname=repmgr connect_timeout=2'
data_directory='/data/postgresql16/data'

config_directory='/data/postgresql16/repmgr.conf'
log_level='INFO'
log_facility='STDERR'
log_file='/data/postgresql16/repmgr.log'
promote_command='/data/postgresql16/bin/repmgr standby promote -f /data/postgresql16/repmgr.conf'
follow_command='/data/postgresql16/bin/repmgr standby follow -f /data/postgresql16/repmgr.conf -W --upstream-node-id=%n'
failover='automatic'
service_start_command = 'sudo systemctl start postgresql.service'
service_stop_command = 'sudo systemctl stop postgresql.service'
service_restart_command = 'sudo systemctl restart postgresql.service'
service_reload_command = 'sudo systemctl reload postgresql.service'
repmgrd_service_start_command = '/data/postgresql16/bin/repmgrd -f /data/postgresql16/repmgr.conf --pid-file /tmp/repmgrd.pid --daemonize'
repmgrd_service_stop_command = 'kill `cat /tmp/repmgrd.pid`'

#repmgr.conf从节点ip7内容
node_id=2
node_name=node2
conninfo='host=storageserver4 user=repmgr dbname=repmgr connect_timeout=2'
data_directory='/data/postgresql16/data'

config_directory='/data/postgresql16/repmgr.conf'
log_level='INFO'
log_facility='STDERR'
log_file='/data/postgresql16/repmgr.log'
promote_command='/data/postgresql16/bin/repmgr standby promote -f /data/postgresql16/repmgr.conf'
follow_command='/data/postgresql16/bin/repmgr standby follow -f /data/postgresql16/repmgr.conf -W --upstream-node-id=%n'
failover='automatic'
service_start_command = 'sudo systemctl start postgresql.service'
service_stop_command = 'sudo systemctl stop postgresql.service'
service_restart_command = 'sudo systemctl restart postgresql.service'
service_reload_command = 'sudo systemctl reload postgresql.service'
repmgrd_service_start_command = '/data/postgresql16/bin/repmgrd -f /data/postgresql16/repmgr.conf --pid-file /tmp/repmgrd.pid --daemonize'
repmgrd_service_stop_command = 'kill `cat /tmp/repmgrd.pid`'

  • repmgrd systemd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
[Unit]
Description=repmgrd.service
After=network.target postgresql.service

[Service]
Type=forking
User=postgres
Group=postgres
WorkingDirectory=/data/postgresql16
ExecStart=/data/postgresql16/bin/repmgr -f /data/postgresql16/repmgr.conf daemon start
ExecStop=/data/postgresql16/bin/repmgr -f /data/postgresql16/repmgr.conf daemon stop
ExecReload=/data/postgresql16/bin/repmgr -f /data/postgresql16/repmgr.conf daemon stop
Restart=always
RestartSec=3

[Install]
WantedBy=multi-user.target

  • 在主服务器ip6的PostgreSQL上创建一个专用超级用户repmgr和一个专用的数据库repmgr
1
2
3
4
5
/data/postgresql16/bin/createuser -s repmgr

/data/postgresql16/bin/createdb repmgr -O repmgr

/data/postgresql16/bin/psql -p5432 -h127.0.0.1 -Upostgres -c 'ALTER USER repmgr SET search_path TO repmgr, "$user", public;'
  • 拷贝相关安装包到从节点ip7服务器
1
2
3
scp ./json-c-devel-0.11-4.el7_0.x86_64.rpm app@storageserver4:/data/upload_dir/

/data/postgresql16/bin/psql 'host=storageserver3 user=postgres dbname=postgres connect_timeout=2'
  • 主节点ip6进行注册
1
2
3
4
5
6
7
8
9
10
/data/postgresql16/bin/repmgr -f /data/postgresql16/repmgr.conf primary register --force

#查看集群状态
/data/postgresql16/bin/repmgr -f /data/postgresql16/repmgr.conf cluster show

#查看repmgr元数据表
/data/postgresql16/bin/psql -p5432 -h127.0.0.1 -Urepmgr
\x
SELECT * FROM repmgr.nodes;

  • 从节点ip7进行克隆
1
2
3
4
5
6
7
8
9
# 清空从节点数据
rm -rf /data/postgresql16/data/*

# 从节点上进行测试能否克隆
/data/postgresql16/bin/repmgr -h storageserver3 -U repmgr -d repmgr -f /data/postgresql16/repmgr.conf standby clone --dry-run

# 满足条件则执行克隆
cd /data/postgresql16/bin
/data/postgresql16/bin/repmgr -h storageserver3 -U repmgr -d repmgr -f /data/postgresql16/repmgr.conf standby clone
  • 从节点ip7注册
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# 启动从节点PGSQL
sudo systemctl start postgresql

# 注册从节点
/data/postgresql16/bin/repmgr -f /data/postgresql16/repmgr.conf standby register --force

# 启动从节点postgresql
sudo systemctl start postgresql

# 查看repmgr元数据表
/data/postgresql16/bin/psql -p5432 -h127.0.0.1 -Urepmgr
\x
SELECT * FROM pg_stat_wal_receiver;

# 验证是否正常注册 【主节点】执行
/data/postgresql16/bin/psql -p5432 -h127.0.0.1 -Urepmgr
\x
SELECT * FROM pg_stat_replication;

# 查看集群状态
/data/postgresql16/bin/repmgr -f /data/postgresql16/repmgr.conf cluster show

keepalived

  • 在ip6和ip6上安装keepalived
1
2
3
./configure --prefix=/data/keepalived

make && make install
  • keepalived systemd
1
2
3
4
5
# keepalived.service在安装后会自动生成
vim /lib/systemd/system/keepalived.service

# 只需要更改以下内容
ExecStart=/opt/app/keepalived/sbin/keepalived -f /data/keepalived/etc/keepalived/keepalived.conf $KEEPALIVED_OPTIONS
  • 查看服务器物理端口
1
2
3
4
5
6
7
8
9
ip a

#选择有ipv4和ipv6的物理端口,否则会报错如下

entering FAULT state (interface ens25f0 down)
Keepalived_vrrp[35803]: (VI_1_BACKUP) entering FAULT state (no IPv4 address for interface)
Keepalived_vrrp[35803]: (VI_1_BACKUP) entering FAULT state
Keepalived_vrrp[35803]: Registering gratuitous ARP shared channel
Keepalived_vrrp[35803]: (VI_1_BACKUP) removing VIPs.
  • 主节点ip6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
vim /data/keepalived/etc/keepalived/keepalived.conf

global_defs {
router_id ip6
#script_user root
#enable_script_security
}
vrrp_script chk_http_port_1 {
script "/data/postgresql16/pg_alive.sh"
interval 10
weight 2
user app
}
vrrp_instance VI_1_MASTER {
state MASTER
interface bond4
# 编号相同
virtual_router_id 50
# 权重相同
priority 50
advert_int 1
authentication {
auth_type PASS
auth_pass 1111
}
virtual_ipaddress {
15.3.69.137
}
track_script {
chk_http_port_1
}
}
  • 从节点ip7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
vim /data/keepalived/etc/keepalived/keepalived.conf

global_defs {
router_id ip7
#script_user root
#enable_script_security
}
vrrp_script chk_http_port_1 {
script "/data/postgresql16/pg_alive.sh"
interval 10
weight 2
user app
}
vrrp_instance VI_1_MASTER {
state MASTER
interface bond4
# 编号相同
virtual_router_id 50
# 权重相同
priority 50
advert_int 1
authentication {
auth_type PASS
auth_pass 1111
}
virtual_ipaddress {
15.3.69.137
}
track_script {
chk_http_port_1
}
}
  • 主节点ip6初始化支撑表
1
2
3
4
5
psql -Upostgres

create table sr_delay(id serial ,sr_date timestamp default now(),new_primary_ip varchar(50),standby_num int);

insert into sr_delay(id,sr_date) values(1,now());
  • pg_alive.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
vim /data/postgresql16/pg_alive.sh

#!/bin/bash
# FILENAME pg_alive.sh
# define DB parameters
PGDATABASE=postgres
PGPORT=5432
PGUSER=postgres
PGHOME=/data/postgresql/bin
LOGFILE=/data/postgresql/pg_alive.log
PATH=$PATH:$PGHOME
keeplognums=30000

# define sql scripts
SQL_UPDATE_HEART="update sr_delay set sr_date = now() where id =1;"
SQL_ROLE="SELECT pg_is_in_recovery from pg_is_in_recovery();"
SQL_CHK_DBALIVED="SELECT 1;"

# process step
db_role=`echo $SQL_ROLE | psql -p $PGPORT -d $PGDATABASE -U $PGUSER -At -w`
if [ $db_role == 't' ];then
echo -e `date +"%F %T"` "Attention1:the current database is standby DB!" >> $LOGFILE
exit 0
fi

#备库不检查存活,主库更新状态
echo $SQL_CHK_DBALIVED | psql -p $PGPORT -d $PGDATABASE -U $PGUSER -At -w
if [ $? -eq 0 ] ;then
echo $SQL_UPDATE_HEART | psql -p $PGPORT -d $PGDATABASE -U $PGUSER -At -w
echo -e `date +"%F %T"` "Success: update the master sr_delay successed!" >> $LOGFILE
exit 0
else
echo -e `date +"%F %T"` "Error:Is the server is running?" >> $LOGFILE
echo -e `date +"%F %T"` "Error:Postgresql is tring stop..." >> $LOGFILE
systemctl stop postgresql >> $LOGFILE 2>&1
sleep 5
echo -e `date +"%F %T"` "Error:keepalived is tring stop..." >> $LOGFILE
#由keepavlived自行切换
#service keepalived stop >>$LOGFILE 2>&1
exit 1
fi
#日志保留 keeplognums 行
if [ ! -f ${MONITOR_LOG} ] ;then touch ${MONITOR_LOG};fi
lognums=`cat ${MONITOR_LOG} |wc -l`
catnum=$((${lognums} -${keeplognums}))
if [[ $lognums -gt ${keeplognums} ]] ; then
sed -i "1,${catnum}d" ${MONITOR_LOG};
fi

备份:

每台 ip6 和 ip6 的 /data/ 下存放 以下2个脚本,并且有执行的权限

每日全量配置数据库数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/bin/bash

# 设置环境变量
source /etc/profile
source ~/.bash_profile

# 配置 PostgreSQL 连接信息
user=postgres
password="yourpassword"
host=localhost
port=5432 # 默认 PostgreSQL 端口
pg_bin=/data/postgresql16/bin # pg_dump 所在路径
date=$(date +%Y%m%d%H%M)

# 创建备份目录
backup_dir="/data/postgresql_databak/$host/"
if [ ! -d "$backup_dir" ]; then
mkdir -p "$backup_dir"
fi

# 获取所有数据库名称,排除系统数据库和空行
for dbname in $(psql -U $user -h $host -p $port -lqt | awk '{print $1}' | grep -Ev '^(template[01]|postgres|\|)$')
do
# 执行备份
$pg_bin/pg_dump -U $user -h $host -p $port --format=c --compress=9 --file="$backup_dir/$dbname-$date.dump" $dbname
if [ $? -eq 0 ]; then
# 记录日志
log="$backup_dir/${host}_backup.log"
echo "${date} ${dbname} ---backup ok!" | tee -a "$log"
else
echo "${date} ${dbname} ---backup fail!" | tee -a "$log"
fi
done

# 清理超过30天的备份记录
find "$backup_dir" -type f -mtime +30 -exec rm -rf {} \; >/dev/null 2>&1

开启定时备份

1
2
3
4

crontab -e
# 新增
0 1 * * * sh /data/postgresql_backup.sh > /dev/null 2>&1

还原

还原指定库脚本: 如有需要还原的话

注意:

  1. 修改要还原的库的文件

  2. 删除要还原的库,再执行。  之所以删除库没在脚本里是怕误操作。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#!/bin/bash

# 设置环境变量
source /etc/profile
source ~/.bash_profile

# 配置 PostgreSQL 连接信息
user=postgres
password="yourpassword"
host=localhost
port=5432 # 默认 PostgreSQL 端口
pg_bin=/data/postgresql16/bin # 或者是你的 pg_dump 所在路径
backup_file="/data/postgresql_databak/$host/nacos-202408291111.dump"
# 创建数据库(如果不存在)
$dbname=nacos
if ! psql -U $user -h $host -p $port -lqt | cut -d \| -f 1 | grep -qw $dbname; then
createdb -U $user -h $host -p $port $dbname
fi

# 还原备份文件
$pg_bin/pg_restore -U $user -h $host -p $port -d $dbname $backup_file

# 检查还原结果
if [ $? -eq 0 ]; then
echo "Restore successful."
else
echo "Restore failed."
fi